diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6adae58832e191..0b2e4e1be86d86 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -64,8 +64,8 @@ clang/test/AST/Interp/ @tbaederr /mlir/Dialect/*/Transforms/Bufferize.cpp @matthias-springer # Linalg Dialect in MLIR. -/mlir/include/mlir/Dialect/Linalg/* @dcaballe @nicolasvasilache @rengolin -/mlir/lib/Dialect/Linalg/* @dcaballe @nicolasvasilache @rengolin +/mlir/include/mlir/Dialect/Linalg @dcaballe @nicolasvasilache @rengolin +/mlir/lib/Dialect/Linalg @dcaballe @nicolasvasilache @rengolin /mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp @MaheshRavishankar @nicolasvasilache /mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @MaheshRavishankar @nicolasvasilache /mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @MaheshRavishankar @nicolasvasilache @@ -85,8 +85,8 @@ clang/test/AST/Interp/ @tbaederr /mlir/**/*VectorToSCF* @banach-space @dcaballe @matthias-springer @nicolasvasilache /mlir/**/*VectorToLLVM* @banach-space @dcaballe @nicolasvasilache /mlir/**/*X86Vector* @aartbik @dcaballe @nicolasvasilache -/mlir/include/mlir/Dialect/Vector/* @dcaballe @nicolasvasilache -/mlir/lib/Dialect/Vector/* @dcaballe @nicolasvasilache +/mlir/include/mlir/Dialect/Vector @dcaballe @nicolasvasilache +/mlir/lib/Dialect/Vector @dcaballe @nicolasvasilache /mlir/lib/Dialect/Vector/Transforms/* @hanhanW @nicolasvasilache /mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache /mlir/**/*EmulateNarrowType* @dcaballe @hanhanW @@ -120,6 +120,9 @@ clang/test/AST/Interp/ @tbaederr /mlir/**/LLVMIR/**/BasicPtxBuilderInterface* @grypp /mlir/**/NVVM* @grypp +# MLIR Index Dialect +/mlir/**/Index* @mogball + # MLIR Python Bindings /mlir/test/python/ @ftynse @makslevental @stellaraccident /mlir/python/ @ftynse @makslevental @stellaraccident @@ -141,3 +144,8 @@ clang/test/AST/Interp/ @tbaederr # ExtractAPI /clang/**/ExtractAPI @daniel-grumberg + +# DWARFLinker, dwarfutil, dsymutil +/llvm/**/DWARFLinker/ @JDevlieghere +/llvm/**/dsymutil/ @JDevlieghere +/llvm/**/llvm-dwarfutil/ @JDevlieghere diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index e003be006c4e15..616b323ad5fac3 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -5,6 +5,7 @@ on: workflows: - "Check code formatting" - "Check for private emails used in PRs" + - "PR Request Release Note" types: - completed @@ -92,7 +93,11 @@ jobs: var pr_number = 0; gql_result.repository.ref.associatedPullRequests.nodes.forEach((pr) => { - if (pr.baseRepository.owner.login = context.repo.owner && pr.state == 'OPEN') { + + // The largest PR number is the one we care about. The only way + // to have more than one associated pull requests is if all the + // old pull requests are in the closed state. 
+ if (pr.baseRepository.owner.login = context.repo.owner && pr.number > pr_number) { pr_number = pr.number; } }); diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml index 5e48ce7aee2e2b..2fa501dda16bb3 100644 --- a/.github/workflows/pr-request-release-note.yml +++ b/.github/workflows/pr-request-release-note.yml @@ -2,7 +2,6 @@ name: PR Request Release Note permissions: contents: read - pull-requests: write on: pull_request: @@ -41,3 +40,10 @@ jobs: --token "$GITHUB_TOKEN" \ request-release-note \ --pr-number ${{ github.event.pull_request.number}} + + - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + if: always() + with: + name: workflow-args + path: | + comments diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 49e226513028f8..00d472c5789168 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -56,6 +56,14 @@ Allow processing of stripped binaries +- `--alt-inst-feature-size=` + + Size of feature field in .altinstructions + +- `--alt-inst-has-padlen` + + Specify that .altinstructions has padlen field + - `--asm-dump[=]` Dump function into assembly @@ -78,6 +86,16 @@ in the input is decoded and re-encoded. If the resulting bytes do not match the input, a warning message is printed. +- `--comp-dir-override=` + + Overrides DW_AT_comp_dir, and provides an alterantive base location, which is + used with DW_AT_dwo_name to construct a path to *.dwo files. + +- `--create-debug-names-section` + + Creates .debug_names section, if the input binary doesn't have it already, for + DWARF5 CU/TUs. + - `--cu-processing-batch-size=` Specifies the size of batches for processing CUs. Higher number has better @@ -93,7 +111,7 @@ - `--debug-skeleton-cu` - Prints out offsetrs for abbrev and debu_info of Skeleton CUs that get patched. + Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched. - `--deterministic-debuginfo` @@ -104,6 +122,10 @@ Add basic block instructions as tool tips on nodes +- `--dump-alt-instructions` + + Dump Linux alternative instructions info + - `--dump-cg=` Dump callgraph to the given file @@ -117,10 +139,34 @@ Dump function CFGs to graphviz format after each stage;enable '-print-loops' for color-coded blocks +- `--dump-linux-exceptions` + + Dump Linux kernel exception table + - `--dump-orc` Dump raw ORC unwind information (sorted) +- `--dump-para-sites` + + Dump Linux kernel paravitual patch sites + +- `--dump-pci-fixups` + + Dump Linux kernel PCI fixup table + +- `--dump-smp-locks` + + Dump Linux kernel SMP locks + +- `--dump-static-calls` + + Dump Linux kernel static calls + +- `--dump-static-keys` + + Dump Linux kernel static keys jump table + - `--dwarf-output-path=` Path to where .dwo files or dwp file will be written out to. @@ -205,6 +251,18 @@ Skip processing of cold functions +- `--log-file=` + + Redirect journaling to a file instead of stdout/stderr + +- `--long-jump-labels` + + Always use long jumps/nops for Linux kernel static keys + +- `--match-profile-with-function-hash` + + Match profile with function hash + - `--max-data-relocations=` Maximum number of data relocations to process @@ -274,6 +332,10 @@ Number of tasks to be created per thread +- `--terminal-trap` + + Assume that execution stops at trap instruction + - `--thread-count=` Number of threads @@ -618,10 +680,6 @@ threshold means fewer functions to process. 
E.g threshold of 90 means only top 10 percent of functions with profile will be processed. -- `--mcf-use-rarcs` - - In MCF, consider the possibility of cancelling flow to balance edges - - `--memcpy1-spec=` List of functions with call sites for which to specialize memcpy() for size 1 @@ -710,7 +768,7 @@ - `none`: do not reorder functions - `exec-count`: order by execution count - `hfsort`: use hfsort algorithm - - `hfsort+`: use hfsort+ algorithm + - `hfsort+`: use cache-directed sort - `cdsort`: use cache-directed sort - `pettis-hansen`: use Pettis-Hansen algorithm - `random`: reorder functions randomly @@ -804,8 +862,8 @@ - `--stale-matching-min-matched-block=` - Minimum percent of exact match block for a function to be considered for - profile inference. + Percentage threshold of matched basic blocks at which stale profile inference + is executed. - `--stale-threshold=` @@ -853,6 +911,10 @@ Only apply branch boundary alignment in hot code +- `--x86-strip-redundant-address-size` + + Remove redundant Address-Size override prefix + ### BOLT options in relocation mode: - `--align-macro-fusion=` @@ -1039,6 +1101,10 @@ Print clusters +- `--print-estimate-edge-counts` + + Print function after edge counts are set for no-LBR profile + - `--print-finalized` Print function after CFG is finalized @@ -1071,6 +1137,10 @@ Print functions after inlining optimization +- `--print-large-functions` + + Print functions that could not be overwritten due to excessive size + - `--print-longjmp` Print functions after longjmp pass @@ -1166,4 +1236,4 @@ - `--print-options` - Print non-default options after command line parsing + Print non-default options after command line parsing \ No newline at end of file diff --git a/bolt/docs/OptimizingLinux.md b/bolt/docs/OptimizingLinux.md new file mode 100644 index 00000000000000..1fffcf22529068 --- /dev/null +++ b/bolt/docs/OptimizingLinux.md @@ -0,0 +1,120 @@ +# Optimizing Linux Kernel with BOLT + + +## Introduction + +Many Linux applications spend a significant amount of their execution time in the kernel. Thus, when we consider code optimization for system performance, it is essential to improve the CPU utilization not only in the user-space applications and libraries but also in the kernel. BOLT has demonstrated double-digit gains while being applied to user-space programs. This guide shows how to apply BOLT to the x86-64 Linux kernel and enhance your system's performance. In our experiments, BOLT boosted database TPS by 2 percent when applied to the kernel compiled with the highest level optimizations, including PGO and LTO. The database spent ~40% of the time in the kernel and was quite sensitive to kernel performance. + +BOLT optimizes code layout based on a low-level execution profile collected with the Linux `perf` tool. The best quality profile should include branch history, such as Intel's last branch records (LBR). BOLT runs on a linked binary and reorders the code while combining frequently executed blocks of instructions in a manner best suited for the hardware. Other than branch instructions, most of the code is left unchanged. Additionally, BOLT updates all metadata associated with the modified code, including DWARF debug information and Linux ORC unwind information. + +While BOLT optimizations are not specific to the Linux kernel, certain quirks distinguish the kernel from user-level applications. + +BOLT has been successfully applied to and tested with several flavors of the x86-64 Linux kernel. 
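+
+The best-quality profile requires branch-stack (LBR) support on the target machine. As a quick sanity check before the longer collection described below (a minimal sketch; the exact failure message varies across `perf` versions and hardware), you can verify that branch-stack sampling works at all:
+
+```bash
+# Requesting branch-stack sampling (-j) fails quickly on hardware or perf
+# builds that cannot record LBR-style branch data.
+$ sudo perf record -e cycles -j any,k -- sleep 1
+```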
+
+
+## QuickStart Guide
+
+BOLT operates on a statically-linked kernel executable, a.k.a. the `vmlinux` binary. However, most Linux distributions use a `vmlinuz` compressed image for system booting. To use BOLT on the kernel, you must either repackage `vmlinuz` after BOLT optimizations or add steps for running BOLT into the kernel build and rebuild `vmlinuz`. Uncompressing `vmlinuz` and repackaging it with a new `vmlinux` binary falls beyond the scope of this guide, and at some point, we may add the capability to run BOLT directly on `vmlinuz`. Meanwhile, this guide focuses on steps for integrating BOLT into the kernel build process.
+
+
+### Building the Kernel
+
+After downloading the kernel sources and configuration for your distribution, you should be able to build `vmlinuz` using the `make bzImage` command. Ideally, the kernel should be a binary match for the kernel on the system you are about to optimize (the target system). The binary matching part is critical, as BOLT performs profile matching and optimizations at the binary level. We recommend installing a freshly built kernel on the target system to avoid any discrepancies.
+
+Note that the kernel build will produce several artifacts besides `bzImage`. The most important of them is the uncompressed `vmlinux` binary, which will be used in the next steps. Make sure to save this file.
+
+Both the build and target systems should have the `perf` tool installed for collecting and processing profiles. If your build system differs from the target, make sure the `perf` versions are compatible. The build system should also have the latest BOLT binary and tools (`llvm-bolt`, `perf2bolt`).
+
+Once the target system boots with the freshly built kernel, start your workload, such as a database benchmark. While the system is under load, collect the kernel profile using `perf`:
+
+
+```bash
+$ sudo perf record -a -e cycles -j any,k -F 5000 -- sleep 600
+```
+
+
+Convert the `perf` profile into a format suitable for BOLT by passing the `vmlinux` binary to `perf2bolt`:
+
+
+```bash
+$ sudo chown $USER perf.data
+$ perf2bolt -p perf.data -o perf.fdata vmlinux
+```
+
+
+Under high load, `perf.data` should be several gigabytes in size, and the converted `perf.fdata` should not exceed 100 MB.
+
+The kernel build needs two changes. The first one is optional but highly recommended: it reserves space for BOLT in the `vmlinux` code section:
+
+
+```diff
+--- a/arch/x86/kernel/vmlinux.lds.S
++++ b/arch/x86/kernel/vmlinux.lds.S
+@@ -139,6 +139,11 @@ SECTIONS
+ STATIC_CALL_TEXT
+ *(.gnu.warning)
+
++ /* Allocate space for BOLT */
++ __bolt_reserved_start = .;
++ . += 2048 * 1024;
++ __bolt_reserved_end = .;
++
+ #ifdef CONFIG_RETPOLINE
+ __indirect_thunk_start = .;
+ *(.text.__x86.*)
+```
+
+
+The second patch adds a step that runs BOLT on the `vmlinux` binary:
+
+
+```diff
+--- a/scripts/link-vmlinux.sh
++++ b/scripts/link-vmlinux.sh
+@@ -340,5 +340,13 @@ if is_enabled CONFIG_KALLSYMS; then
+ fi
+ fi
+
++# Apply BOLT
++BOLT=llvm-bolt
++BOLT_PROFILE=perf.fdata
++BOLT_OPTS="--dyno-stats --eliminate-unreachable=0 --reorder-blocks=ext-tsp --simplify-conditional-tail-calls=0 --skip-funcs=__entry_text_start,irq_entries_start --split-functions"
++mv vmlinux vmlinux.pre-bolt
++echo BOLTing vmlinux
++${BOLT} vmlinux.pre-bolt -o vmlinux --data ${BOLT_PROFILE} ${BOLT_OPTS}
++
+ # For fixdep
+ echo "vmlinux: $0" > .vmlinux.d
+```
+
+
+If you skipped the first patch or are running BOLT on a pre-built `vmlinux` binary, drop the `--split-functions` option.
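+
+For reference, the standalone equivalent of the `link-vmlinux.sh` step above, run against a pre-built `vmlinux` outside of the kernel build, looks like the sketch below. It reuses the options from `BOLT_OPTS`, minus `--split-functions`; the output name `vmlinux.bolt` is just an example:
+
+```bash
+# Same options as BOLT_OPTS above, with --split-functions dropped because no
+# BOLT-reserved space exists in a pre-built binary.
+$ llvm-bolt vmlinux -o vmlinux.bolt --data perf.fdata \
+    --dyno-stats --eliminate-unreachable=0 --reorder-blocks=ext-tsp \
+    --simplify-conditional-tail-calls=0 \
+    --skip-funcs=__entry_text_start,irq_entries_start
+```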
+ + +## Performance Expectations + +By improving the code layout, BOLT can boost the kernel's performance by up to 5% by reducing instruction cache misses and branch mispredictions. When measuring total system performance, you should scale this number accordingly based on the time your application spends in the kernel (excluding I/O time). + + +## Profile Quality + +The timing and duration of the profiling may have a significant effect on the performance of the BOLTed kernel. If you don't know your workload well, it's recommended that you profile for the whole duration of the benchmark run. As longer times will result in larger `perf.data` files, you can lower the profiling frequency by providing a smaller value of `-F` flag. E.g., to record the kernel profile for half an hour, use the following command: + + +```bash +$ sudo perf record -a -e cycles -j any,k -F 1000 -- sleep 1800 +``` + + + +## BOLT Disassembly + +BOLT annotates the disassembly with control-flow information and attaches Linux-specific metadata to the code. To view annotated disassembly, run: + + +```bash +$ llvm-bolt vmlinux -o /dev/null --print-cfg +``` + + +If you want to limit the disassembly to a set of functions, add `--print-only=,,...`, where a function name is specified using regular expressions. diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 3c641581e247a8..1048e50cd80582 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -930,6 +930,10 @@ class BinaryFunction { return const_cast(this)->getInstructionAtOffset(Offset); } + /// When the function is in disassembled state, return an instruction that + /// contains the \p Offset. + MCInst *getInstructionContainingOffset(uint64_t Offset); + std::optional disassembleInstructionAtOffset(uint64_t Offset) const; /// Return offset for the first instruction. If there is data at the diff --git a/bolt/include/bolt/Passes/BinaryFunctionCallGraph.h b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h similarity index 95% rename from bolt/include/bolt/Passes/BinaryFunctionCallGraph.h rename to bolt/include/bolt/Core/BinaryFunctionCallGraph.h index 52e17db4f50ce4..4ff5b1b94c5e5c 100644 --- a/bolt/include/bolt/Passes/BinaryFunctionCallGraph.h +++ b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/CallGraph.h ----------------------------------*- C++ -*-===// +//===- bolt/Core/CallGraph.h ----------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,7 +9,7 @@ #ifndef BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H #define BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H -#include "bolt/Passes/CallGraph.h" +#include "bolt/Core/CallGraph.h" #include #include #include diff --git a/bolt/include/bolt/Passes/CallGraph.h b/bolt/include/bolt/Core/CallGraph.h similarity index 98% rename from bolt/include/bolt/Passes/CallGraph.h rename to bolt/include/bolt/Core/CallGraph.h index bdbc50bb78e87d..2fc18e61afcaaa 100644 --- a/bolt/include/bolt/Passes/CallGraph.h +++ b/bolt/include/bolt/Core/CallGraph.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/CallGraph.h ----------------------------------*- C++ -*-===// +//===- bolt/Core/CallGraph.h ----------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/bolt/include/bolt/Passes/CallGraphWalker.h b/bolt/include/bolt/Core/CallGraphWalker.h similarity index 95% rename from bolt/include/bolt/Passes/CallGraphWalker.h rename to bolt/include/bolt/Core/CallGraphWalker.h index ac45644be362f2..b0a73aee143691 100644 --- a/bolt/include/bolt/Passes/CallGraphWalker.h +++ b/bolt/include/bolt/Core/CallGraphWalker.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/CallGraphWalker.h ----------------------------*- C++ -*-===// +//===- bolt/Core/CallGraphWalker.h ----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 765372aa9e402f..83f4d4c649fd81 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -439,7 +439,20 @@ class MCPlusBuilder { } /// Check whether this conditional branch can be reversed - virtual bool isReversibleBranch(const MCInst &Inst) const { return true; } + virtual bool isReversibleBranch(const MCInst &Inst) const { + assert(!isUnsupportedInstruction(Inst) && isConditionalBranch(Inst) && + "Instruction is not known conditional branch"); + + if (isDynamicBranch(Inst)) + return false; + return true; + } + + /// Return true if this instruction inhibits analysis of the containing + /// function. + virtual bool isUnsupportedInstruction(const MCInst &Inst) const { + return false; + } /// Return true of the instruction is of pseudo kind. virtual bool isPseudo(const MCInst &Inst) const { diff --git a/bolt/include/bolt/Passes/HFSort.h b/bolt/include/bolt/Passes/HFSort.h index 3787187733d072..2a35760329bb8e 100644 --- a/bolt/include/bolt/Passes/HFSort.h +++ b/bolt/include/bolt/Passes/HFSort.h @@ -19,7 +19,7 @@ #ifndef BOLT_PASSES_HFSORT_H #define BOLT_PASSES_HFSORT_H -#include "bolt/Passes/CallGraph.h" +#include "bolt/Core/CallGraph.h" #include #include diff --git a/bolt/include/bolt/Passes/RegReAssign.h b/bolt/include/bolt/Passes/RegReAssign.h index c50e32ff46e293..a7554a12151042 100644 --- a/bolt/include/bolt/Passes/RegReAssign.h +++ b/bolt/include/bolt/Passes/RegReAssign.h @@ -9,7 +9,7 @@ #ifndef BOLT_PASSES_REGREASSIGN_H #define BOLT_PASSES_REGREASSIGN_H -#include "bolt/Passes/BinaryFunctionCallGraph.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Passes/BinaryPasses.h" #include "bolt/Passes/RegAnalysis.h" diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h index 4c88142c588714..7da32324bc9333 100644 --- a/bolt/include/bolt/Passes/ReorderFunctions.h +++ b/bolt/include/bolt/Passes/ReorderFunctions.h @@ -9,7 +9,7 @@ #ifndef BOLT_PASSES_REORDER_FUNCTIONS_H #define BOLT_PASSES_REORDER_FUNCTIONS_H -#include "bolt/Passes/BinaryFunctionCallGraph.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Passes/BinaryPasses.h" namespace llvm { diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index c608ff40c6d9c3..f8530c424b8e9b 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -543,7 +543,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { else OS << "\n"; } - if (BB->getCFIState() >= 0) + if (hasCFI()) OS << " CFI State : " << BB->getCFIState() << '\n'; if (opts::EnableBAT) { OS << " Input offset: 0x" << Twine::utohexstr(BB->getInputOffset()) @@ -611,7 +611,7 @@ void BinaryFunction::print(raw_ostream &OS, 
std::string Annotation) { } // In CFG_Finalized state we can miscalculate CFI state at exit. - if (CurrentState == State::CFG) { + if (CurrentState == State::CFG && hasCFI()) { const int32_t CFIStateAtExit = BB->getCFIStateAtExit(); if (CFIStateAtExit >= 0) OS << " CFI State: " << CFIStateAtExit << '\n'; @@ -1276,6 +1276,10 @@ Error BinaryFunction::disassemble() { } } + bool IsUnsupported = BC.MIB->isUnsupportedInstruction(Instruction); + if (IsUnsupported) + setIgnored(); + if (MIB->isBranch(Instruction) || MIB->isCall(Instruction)) { uint64_t TargetAddress = 0; if (MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size, @@ -1289,12 +1293,10 @@ Error BinaryFunction::disassemble() { const bool IsCondBranch = MIB->isConditionalBranch(Instruction); MCSymbol *TargetSymbol = nullptr; - if (!BC.MIB->isReversibleBranch(Instruction)) { - setIgnored(); - if (BinaryFunction *TargetFunc = + if (IsUnsupported) + if (auto *TargetFunc = BC.getBinaryFunctionContainingAddress(TargetAddress)) TargetFunc->setIgnored(); - } if (IsCall && containsAddress(TargetAddress)) { if (TargetAddress == getAddress()) { @@ -4472,6 +4474,18 @@ MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { } } +MCInst *BinaryFunction::getInstructionContainingOffset(uint64_t Offset) { + assert(CurrentState == State::Disassembled && "Wrong function state"); + + if (Offset > Size) + return nullptr; + + auto II = Instructions.upper_bound(Offset); + assert(II != Instructions.begin() && "First instruction not at offset 0"); + --II; + return &II->second; +} + void BinaryFunction::printLoopInfo(raw_ostream &OS) const { if (!opts::shouldPrint(*this)) return; diff --git a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp b/bolt/lib/Core/BinaryFunctionCallGraph.cpp similarity index 98% rename from bolt/lib/Passes/BinaryFunctionCallGraph.cpp rename to bolt/lib/Core/BinaryFunctionCallGraph.cpp index bbcc9751c0cbe6..b4b7897aa426ae 100644 --- a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/lib/Core/BinaryFunctionCallGraph.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/BinaryFunctionCallGraph.cpp ----------------------------===// +//===- bolt/Core/BinaryFunctionCallGraph.cpp ----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "bolt/Passes/BinaryFunctionCallGraph.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryFunction.h" #include "llvm/Support/CommandLine.h" diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index 873cf67a56291f..bb58667066fd88 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -17,8 +17,11 @@ add_llvm_library(LLVMBOLTCore BinaryData.cpp BinaryEmitter.cpp BinaryFunction.cpp + BinaryFunctionCallGraph.cpp BinaryFunctionProfile.cpp BinarySection.cpp + CallGraph.cpp + CallGraphWalker.cpp DebugData.cpp DebugNames.cpp DIEBuilder.cpp diff --git a/bolt/lib/Passes/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp similarity index 92% rename from bolt/lib/Passes/CallGraph.cpp rename to bolt/lib/Core/CallGraph.cpp index ee2ec26f3e50d8..f1d52737bf5560 100644 --- a/bolt/lib/Passes/CallGraph.cpp +++ b/bolt/lib/Core/CallGraph.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/CallGraph.cpp ------------------------------------------===// +//===- bolt/Core/CallGraph.cpp ------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,16 +10,16 @@ // //===----------------------------------------------------------------------===// -#include "bolt/Passes/CallGraph.h" +#include "bolt/Core/CallGraph.h" #define DEBUG_TYPE "callgraph" #if defined(__x86_64__) && !defined(_MSC_VER) -# if (!defined USE_SSECRC) -# define USE_SSECRC -# endif +#if (!defined USE_SSECRC) +#define USE_SSECRC +#endif #else -# undef USE_SSECRC +#undef USE_SSECRC #endif static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) { @@ -50,7 +50,7 @@ static inline size_t hash_int64_pair(int64_t k1, int64_t k2) { // crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes // differently from (k2, k1). k1 += k1; - __asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2)); + __asm("crc32q %1, %0\n" : "+r"(k1) : "rm"(k2)); return k1; #else return (hash_int64(k1) << 1) ^ hash_int64(k2); @@ -118,5 +118,5 @@ void CallGraph::adjustArcWeights() { } } -} -} +} // namespace bolt +} // namespace llvm diff --git a/bolt/lib/Passes/CallGraphWalker.cpp b/bolt/lib/Core/CallGraphWalker.cpp similarity index 90% rename from bolt/lib/Passes/CallGraphWalker.cpp rename to bolt/lib/Core/CallGraphWalker.cpp index 53fb1cb4d98c99..cbfa178d8e0687 100644 --- a/bolt/lib/Passes/CallGraphWalker.cpp +++ b/bolt/lib/Core/CallGraphWalker.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/CallGraphWalker.cpp ------------------------------------===// +//===- bolt/Core/CallGraphWalker.cpp ------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "bolt/Passes/CallGraphWalker.h" -#include "bolt/Passes/BinaryFunctionCallGraph.h" +#include "bolt/Core/CallGraphWalker.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" #include diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index 04057a895d6662..407d8b03f73977 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -4,11 +4,8 @@ add_llvm_library(LLVMBOLTPasses AllocCombiner.cpp AsmDump.cpp BinaryPasses.cpp - BinaryFunctionCallGraph.cpp CMOVConversion.cpp CacheMetrics.cpp - CallGraph.cpp - CallGraphWalker.cpp DataflowAnalysis.cpp DataflowInfoManager.cpp FrameAnalysis.cpp diff --git a/bolt/lib/Passes/FrameAnalysis.cpp b/bolt/lib/Passes/FrameAnalysis.cpp index 4ebfd8f158f7f5..e027fc0403cd9f 100644 --- a/bolt/lib/Passes/FrameAnalysis.cpp +++ b/bolt/lib/Passes/FrameAnalysis.cpp @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/FrameAnalysis.h" +#include "bolt/Core/CallGraphWalker.h" #include "bolt/Core/ParallelUtilities.h" -#include "bolt/Passes/CallGraphWalker.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Timer.h" #include diff --git a/bolt/lib/Passes/FrameOptimizer.cpp b/bolt/lib/Passes/FrameOptimizer.cpp index 8461225e1819c1..1c0f9555f9eb9b 100644 --- a/bolt/lib/Passes/FrameOptimizer.cpp +++ b/bolt/lib/Passes/FrameOptimizer.cpp @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/FrameOptimizer.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Core/ParallelUtilities.h" -#include "bolt/Passes/BinaryFunctionCallGraph.h" #include "bolt/Passes/DataflowInfoManager.h" #include "bolt/Passes/ShrinkWrapping.h" #include "bolt/Passes/StackAvailableExpressions.h" diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp index 55eede641fd2f7..a73eb867b37775 100644 --- a/bolt/lib/Passes/IndirectCallPromotion.cpp +++ b/bolt/lib/Passes/IndirectCallPromotion.cpp @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/IndirectCallPromotion.h" -#include "bolt/Passes/BinaryFunctionCallGraph.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Passes/DataflowInfoManager.h" #include "bolt/Passes/Inliner.h" #include "llvm/ADT/STLExtras.h" diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index 14f506f9ca9689..e824a42d826964 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -479,8 +479,7 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, HasJumpTable = true; else if (BC.MIB->isUnconditionalBranch(Inst)) HasUnconditionalBranch = true; - else if ((!BC.MIB->isCall(Inst) && !BC.MIB->isConditionalBranch(Inst)) || - !BC.MIB->isReversibleBranch(Inst)) + else if (!(BC.MIB->isCall(Inst) || BC.MIB->isConditionalBranch(Inst))) continue; const uint32_t FromOffset = *BC.MIB->getOffset(Inst); diff --git a/bolt/lib/Passes/JTFootprintReduction.cpp b/bolt/lib/Passes/JTFootprintReduction.cpp index fd291f96004d93..71bdbba950df14 100644 --- a/bolt/lib/Passes/JTFootprintReduction.cpp +++ b/bolt/lib/Passes/JTFootprintReduction.cpp @@ -11,7 +11,7 @@ 
//===----------------------------------------------------------------------===// #include "bolt/Passes/JTFootprintReduction.h" -#include "bolt/Passes/BinaryFunctionCallGraph.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Passes/DataflowInfoManager.h" #include "llvm/Support/CommandLine.h" diff --git a/bolt/lib/Passes/RegAnalysis.cpp b/bolt/lib/Passes/RegAnalysis.cpp index 9054385c20b59b..66ca6800b1a5fa 100644 --- a/bolt/lib/Passes/RegAnalysis.cpp +++ b/bolt/lib/Passes/RegAnalysis.cpp @@ -12,7 +12,7 @@ #include "bolt/Passes/RegAnalysis.h" #include "bolt/Core/BinaryFunction.h" -#include "bolt/Passes/CallGraphWalker.h" +#include "bolt/Core/CallGraphWalker.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp index 0becfb4a06a38c..60349f18b11d39 100644 --- a/bolt/lib/Passes/RegReAssign.cpp +++ b/bolt/lib/Passes/RegReAssign.cpp @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/RegReAssign.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Core/MCPlus.h" -#include "bolt/Passes/BinaryFunctionCallGraph.h" #include "bolt/Passes/DataflowAnalysis.h" #include "bolt/Passes/DataflowInfoManager.h" #include "bolt/Utils/Utils.h" diff --git a/bolt/lib/Passes/StokeInfo.cpp b/bolt/lib/Passes/StokeInfo.cpp index 499cac4217099c..dd033508d200c1 100644 --- a/bolt/lib/Passes/StokeInfo.cpp +++ b/bolt/lib/Passes/StokeInfo.cpp @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/StokeInfo.h" -#include "bolt/Passes/BinaryFunctionCallGraph.h" +#include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Passes/DataflowInfoManager.h" #include "llvm/Support/CommandLine.h" diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index f25f59201f1cd9..554def697fa21c 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -22,12 +22,18 @@ namespace opts { extern cl::opt Verbosity; extern cl::OptionCategory BoltOptCategory; extern cl::opt InferStaleProfile; +extern cl::opt Lite; static llvm::cl::opt IgnoreHash("profile-ignore-hash", cl::desc("ignore hash while reading function profile"), cl::Hidden, cl::cat(BoltOptCategory)); +llvm::cl::opt + MatchProfileWithFunctionHash("match-profile-with-function-hash", + cl::desc("Match profile with function hash"), + cl::Hidden, cl::cat(BoltOptCategory)); + llvm::cl::opt ProfileUseDFS("profile-use-dfs", cl::desc("use DFS order for YAML profile"), cl::Hidden, cl::cat(BoltOptCategory)); @@ -329,6 +335,8 @@ Error YAMLProfileReader::preprocessProfile(BinaryContext &BC) { } bool YAMLProfileReader::mayHaveProfileData(const BinaryFunction &BF) { + if (opts::MatchProfileWithFunctionHash) + return true; for (StringRef Name : BF.getNames()) if (ProfileFunctionNames.contains(Name)) return true; @@ -363,9 +371,24 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { return Profile.Hash == static_cast(BF.getHash()); }; - // We have to do 2 passes since LTO introduces an ambiguity in function - // names. The first pass assigns profiles that match 100% by name and - // by hash. The second pass allows name ambiguity for LTO private functions. + uint64_t MatchedWithExactName = 0; + uint64_t MatchedWithHash = 0; + uint64_t MatchedWithLTOCommonName = 0; + + // Computes hash for binary functions. 
+ if (opts::MatchProfileWithFunctionHash) { + for (auto &[_, BF] : BC.getBinaryFunctions()) { + BF.computeHash(YamlBP.Header.IsDFSOrder, YamlBP.Header.HashFunction); + } + } else if (!opts::IgnoreHash) { + for (BinaryFunction *BF : ProfileBFs) { + if (!BF) + continue; + BF->computeHash(YamlBP.Header.IsDFSOrder, YamlBP.Header.HashFunction); + } + } + + // This first pass assigns profiles that match 100% by name and by hash. for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs)) { if (!BF) continue; @@ -374,15 +397,35 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { // the profile. Function.setExecutionCount(BinaryFunction::COUNT_NO_PROFILE); - // Recompute hash once per function. - if (!opts::IgnoreHash) - Function.computeHash(YamlBP.Header.IsDFSOrder, - YamlBP.Header.HashFunction); - - if (profileMatches(YamlBF, Function)) + if (profileMatches(YamlBF, Function)) { matchProfileToFunction(YamlBF, Function); + ++MatchedWithExactName; + } } + // Iterates through profiled functions to match the first binary function with + // the same exact hash. Serves to match identical, renamed functions. + // Collisions are possible where multiple functions share the same exact hash. + if (opts::MatchProfileWithFunctionHash) { + DenseMap StrictHashToBF; + StrictHashToBF.reserve(BC.getBinaryFunctions().size()); + + for (auto &[_, BF] : BC.getBinaryFunctions()) + StrictHashToBF[BF.getHash()] = &BF; + + for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) { + if (YamlBF.Used) + continue; + auto It = StrictHashToBF.find(YamlBF.Hash); + if (It != StrictHashToBF.end() && !ProfiledFunctions.count(It->second)) { + BinaryFunction *BF = It->second; + matchProfileToFunction(YamlBF, *BF); + ++MatchedWithHash; + } + } + } + + // This second pass allows name ambiguity for LTO private functions. for (const auto &[CommonName, LTOProfiles] : LTOCommonNameMap) { if (!LTOCommonNameFunctionMap.contains(CommonName)) continue; @@ -396,6 +439,7 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { for (BinaryFunction *BF : Functions) { if (!ProfiledFunctions.count(BF) && profileMatches(*YamlBF, *BF)) { matchProfileToFunction(*YamlBF, *BF); + ++MatchedWithLTOCommonName; return true; } } @@ -407,8 +451,10 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { // partially. if (!ProfileMatched && LTOProfiles.size() == 1 && Functions.size() == 1 && !LTOProfiles.front()->Used && - !ProfiledFunctions.count(*Functions.begin())) + !ProfiledFunctions.count(*Functions.begin())) { matchProfileToFunction(*LTOProfiles.front(), **Functions.begin()); + ++MatchedWithLTOCommonName; + } } for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs)) @@ -420,6 +466,15 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { errs() << "BOLT-WARNING: profile ignored for function " << YamlBF.Name << '\n'; + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: matched " << MatchedWithExactName + << " functions with identical names\n"; + outs() << "BOLT-INFO: matched " << MatchedWithHash + << " functions with hash\n"; + outs() << "BOLT-INFO: matched " << MatchedWithLTOCommonName + << " functions with matching LTO common names\n"; + } + // Set for parseFunctionProfile(). 
NormalizeByInsnCount = usesEvent("cycles") || usesEvent("instructions"); NormalizeByCalls = usesEvent("branches"); @@ -439,6 +494,12 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { BC.setNumUnusedProfiledObjects(NumUnused); + if (opts::Lite && opts::MatchProfileWithFunctionHash) { + for (BinaryFunction *BF : BC.getAllBinaryFunctions()) + if (!BF->hasProfile()) + BF->setIgnored(); + } + return Error::success(); } diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp index aa18e79f73b9d2..03b414b71caca7 100644 --- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp +++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp @@ -295,9 +295,6 @@ class LinuxKernelRewriter final : public MetadataRewriter { if (Error E = processSMPLocks()) return E; - if (Error E = readORCTables()) - return E; - if (Error E = readStaticCalls()) return E; @@ -313,6 +310,11 @@ class LinuxKernelRewriter final : public MetadataRewriter { if (Error E = readAltInstructions()) return E; + // Some ORC entries could be linked to alternative instruction + // sequences. Hence, we read ORC after .altinstructions. + if (Error E = readORCTables()) + return E; + if (Error E = readPCIFixupTable()) return E; @@ -563,11 +565,28 @@ Error LinuxKernelRewriter::readORCTables() { if (!BF->hasInstructions()) continue; - MCInst *Inst = BF->getInstructionAtOffset(IP - BF->getAddress()); - if (!Inst) + const uint64_t Offset = IP - BF->getAddress(); + MCInst *Inst = BF->getInstructionAtOffset(Offset); + if (!Inst) { + // Check if there is an alternative instruction(s) at this IP. Multiple + // alternative instructions can take a place of a single original + // instruction and each alternative can have a separate ORC entry. + // Since ORC table is shared between all alternative sequences, there's + // a requirement that only one (out of many) sequences can have an + // instruction from the ORC table to avoid ambiguities/conflicts. + // + // For now, we have limited support for alternatives. I.e. we still print + // functions with them, but will not change the code in the output binary. + // As such, we can ignore alternative ORC entries. They will be preserved + // in the binary, but will not get printed in the instruction stream. + Inst = BF->getInstructionContainingOffset(Offset); + if (Inst || BC.MIB->hasAnnotation(*Inst, "AltInst")) + continue; + return createStringError( errc::executable_format_error, "no instruction at address 0x%" PRIx64 " in .orc_unwind_ip", IP); + } // Some addresses will have two entries associated with them. The first // one being a "weak" section terminator. 
Since we ignore the terminator, @@ -1440,7 +1459,7 @@ Error LinuxKernelRewriter::tryReadAltInstructions(uint32_t AltInstFeatureSize, AltBF->setIgnored(); } - if (!BF || !BC.shouldEmit(*BF)) + if (!BF || !BF->hasInstructions()) continue; if (OrgInstAddress + OrgSize > BF->getAddress() + BF->getSize()) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 1a3a8af21d81b6..c118f2c3ccd65c 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -82,6 +82,7 @@ extern cl::opt Hugify; extern cl::opt Instrument; extern cl::opt JumpTables; extern cl::opt KeepNops; +extern cl::opt Lite; extern cl::list ReorderData; extern cl::opt ReorderFunctions; extern cl::opt TerminalTrap; @@ -140,9 +141,6 @@ KeepTmp("keep-tmp", cl::Hidden, cl::cat(BoltCategory)); -cl::opt Lite("lite", cl::desc("skip processing of cold functions"), - cl::cat(BoltCategory)); - static cl::opt LiteThresholdPct("lite-threshold-pct", cl::desc("threshold (in percent) for selecting functions to process in lite " diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 515c9a94c58cd4..5bd77958934f92 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -328,19 +328,19 @@ class X86MCPlusBuilder : public MCPlusBuilder { return false; } - bool isReversibleBranch(const MCInst &Inst) const override { - if (isDynamicBranch(Inst)) - return false; - + bool isUnsupportedInstruction(const MCInst &Inst) const override { switch (Inst.getOpcode()) { default: - return true; + return false; + case X86::LOOP: case X86::LOOPE: case X86::LOOPNE: case X86::JECXZ: case X86::JRCXZ: - return false; + // These have a short displacement, and therefore (often) break after + // basic block relayout. + return true; } } @@ -1879,11 +1879,9 @@ class X86MCPlusBuilder : public MCPlusBuilder { continue; } - // Handle conditional branches and ignore indirect branches - if (isReversibleBranch(*I) && getCondCode(*I) == X86::COND_INVALID) { - // Indirect branch + // Ignore indirect branches + if (getCondCode(*I) == X86::COND_INVALID) return false; - } if (CondBranch == nullptr) { const MCSymbol *TargetBB = getTargetSymbol(*I); diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index 41c89bc8aeba4e..b9bc79f408a6b2 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -128,6 +128,9 @@ cl::opt cl::desc("instrument code to generate accurate profile data"), cl::cat(BoltOptCategory)); +cl::opt Lite("lite", cl::desc("skip processing of cold functions"), + cl::cat(BoltCategory)); + cl::opt OutputFilename("o", cl::desc(""), diff --git a/bolt/test/X86/hashing-based-function-matching.test b/bolt/test/X86/hashing-based-function-matching.test new file mode 100644 index 00000000000000..41c991b4612ece --- /dev/null +++ b/bolt/test/X86/hashing-based-function-matching.test @@ -0,0 +1,63 @@ +## Tests function matching in YAMLProfileReader by function hash. 
+ +# REQUIRES: system-linux +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib +# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \ +# RUN: --print-cfg --match-profile-with-function-hash --profile-ignore-hash=0 2>&1 | FileCheck %s + +# CHECK: BOLT-INFO: matched 1 functions with hash + +#--- main.s +.globl main +.type main, @function +main: + .cfi_startproc +.LBB00: + pushq %rbp + movq %rsp, %rbp + subq $16, %rsp + testq %rax, %rax + js .LBB03 +.LBB01: + jne .LBB04 +.LBB02: + nop +.LBB03: + xorl %eax, %eax + addq $16, %rsp + popq %rbp + retq +.LBB04: + xorl %eax, %eax + addq $16, %rsp + popq %rbp + retq +## For relocations against .text + .reloc 0, R_X86_64_NONE + .cfi_endproc + .size main, .-main + +#--- yaml +--- +header: + profile-version: 1 + binary-name: 'hashing-based-function-matching.s.tmp.exe' + binary-build-id: '' + profile-flags: [ lbr ] + profile-origin: branch profile reader + profile-events: '' + dfs-order: false + hash-func: xxh3 +functions: + - name: main2 + fid: 0 + hash: 0x6E7F15254DE2478 + exec: 1 + nblocks: 6 + blocks: + - bid: 1 + insns: 1 + succ: [ { bid: 3, cnt: 1} ] +... diff --git a/bolt/test/X86/issue20.s b/bolt/test/X86/issue20.s index 99a4f2ea2ac998..9645eaf2e62234 100644 --- a/bolt/test/X86/issue20.s +++ b/bolt/test/X86/issue20.s @@ -13,7 +13,6 @@ # CHECK-NOT: BOLT-INFO: UCE removed {{.*}} blocks and {{.*}} bytes of code # CHECK: Binary Function "main" # CHECK: .LFT{{.*}} (2 instructions, align : 1) -# CHECK-NEXT: CFI State : 0 # CHECK-NEXT: 00000004: andq # CHECK-NEXT: 00000008: jmpq # CHECK-NEXT: Successors: .Ltmp{{.*}}, .Ltmp{{.*}}, .Ltmp{{.*}}, .Ltmp{{.*}} diff --git a/bolt/test/X86/issue20.test b/bolt/test/X86/issue20.test index dcb1ce5ab1567f..09f9c36cc9e3dc 100644 --- a/bolt/test/X86/issue20.test +++ b/bolt/test/X86/issue20.test @@ -9,7 +9,6 @@ CHECK-NOT: BOLT-INFO: UCE removed {{.*}} blocks and {{.*}}| bytes of code CHECK: Binary Function "main" CHECK: .LFT0 (2 instructions, align : 1) -CHECK-NEXT: CFI State : 0 CHECK-NEXT: 00000004: andq CHECK-NEXT: 00000008: jmpq CHECK-NEXT: Successors: .Ltmp1, .Ltmp2, .Ltmp3, .Ltmp4 diff --git a/bolt/test/X86/linux-alt-instruction.s b/bolt/test/X86/linux-alt-instruction.s index dc1b12a277785b..fe3abbfc2b4c93 100644 --- a/bolt/test/X86/linux-alt-instruction.s +++ b/bolt/test/X86/linux-alt-instruction.s @@ -38,7 +38,7 @@ # RUN: llvm-bolt %t.exe --print-cfg -o %t.fs4.out | FileCheck %s # CHECK: BOLT-INFO: Linux kernel binary detected -# CHECK: BOLT-INFO: parsed 2 alternative instruction entries +# CHECK: BOLT-INFO: parsed 3 alternative instruction entries .text .globl _start @@ -50,10 +50,12 @@ _start: # CHECK: rdtsc # CHECK-SAME: AltInst: 1 # CHECK-SAME: AltInst2: 2 +# CHECK-SAME: AltInst3: 3 nop # CHECK-NEXT: nop # CHECK-SAME: AltInst: 1 # CHECK-SAME: AltInst2: 2 +# CHECK-SAME: AltInst3: 3 nop nop .L1: @@ -66,6 +68,9 @@ _start: rdtsc .A1: rdtscp +.A2: + pushf + pop %rax .Ae: ## Alternative instruction info. @@ -92,11 +97,51 @@ _start: .word 0x3b # feature flags .endif .byte .L1 - .L0 # org size - .byte .Ae - .A1 # alt size + .byte .A2 - .A1 # alt size .ifdef PADLEN .byte 0 .endif + .long .L0 - . # org instruction + .long .A2 - . # alt instruction +.ifdef FEATURE_SIZE_4 + .long 0x110 # feature flags +.else + .word 0x110 # feature flags +.endif + .byte .L1 - .L0 # org size + .byte .Ae - .A2 # alt size +.ifdef PADLEN + .byte 0 +.endif + +## ORC unwind for "pushf; pop %rax" alternative sequence. 
+ .section .orc_unwind,"a",@progbits + .align 4 + .section .orc_unwind_ip,"a",@progbits + .align 4 + + .section .orc_unwind + .2byte 8 + .2byte 0 + .2byte 0x205 + .section .orc_unwind_ip + .long _start - . + + .section .orc_unwind + .2byte 16 + .2byte 0 + .2byte 0x205 + .section .orc_unwind_ip + .long .L0 + 1 - . + + .section .orc_unwind + .2byte 8 + .2byte 0 + .2byte 0x205 + .section .orc_unwind_ip + .long .L0 + 2 - . + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/sctc-bug4.test b/bolt/test/X86/sctc-bug4.test index 21a602b6729ae3..e192a238a4d0c9 100644 --- a/bolt/test/X86/sctc-bug4.test +++ b/bolt/test/X86/sctc-bug4.test @@ -6,24 +6,19 @@ RUN: llvm-bolt %t -o %t.null --enable-bat \ RUN: -funcs=test_func -print-sctc -sequential-disassembly 2>&1 | FileCheck %s CHECK: .Ltmp2 (3 instructions, align : 1) -CHECK-NEXT: CFI State : 0 CHECK-NEXT: Input offset: 0x24 CHECK-NEXT: Predecessors: .LFT1 CHECK-NEXT: 00000024: cmpq $0x20, %rsi CHECK-NEXT: 00000028: ja dummy # TAILCALL # Offset: 53 # CTCTakenCount: 0 CHECK-NEXT: 0000002a: jmp .Ltmp4 CHECK-NEXT: Successors: .Ltmp4 -CHECK-NEXT: CFI State: 0 CHECK: .Ltmp1 (2 instructions, align : 1) -CHECK-NEXT: CFI State : 0 CHECK-NEXT: Input offset: 0x2c CHECK-NEXT: Predecessors: .LFT0 CHECK-NEXT: 0000002c: xorq %r11, %rax CHECK-NEXT: 0000002f: retq -CHECK-NEXT: CFI State: 0 CHECK: .Ltmp4 (4 instructions, align : 1) -CHECK-NEXT: CFI State : 0 CHECK-NEXT: Input offset: 0x3a CHECK-NEXT: Predecessors: .Ltmp2 diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp index c0faf5f7e8fd9b..b5da17564c2e01 100644 --- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp @@ -55,7 +55,7 @@ class HTMLTag { operator TagType() const { return Value; } operator bool() = delete; - bool IsSelfClosing() const; + bool isSelfClosing() const; llvm::SmallString<16> ToString() const; private: @@ -71,7 +71,7 @@ struct HTMLNode { HTMLNode(NodeType Type) : Type(Type) {} virtual ~HTMLNode() = default; - virtual void Render(llvm::raw_ostream &OS, int IndentationLevel) = 0; + virtual void render(llvm::raw_ostream &OS, int IndentationLevel) = 0; NodeType Type; // Type of node }; @@ -80,7 +80,7 @@ struct TextNode : public HTMLNode { : HTMLNode(NodeType::NODE_TEXT), Text(Text.str()) {} std::string Text; // Content of node - void Render(llvm::raw_ostream &OS, int IndentationLevel) override; + void render(llvm::raw_ostream &OS, int IndentationLevel) override; }; struct TagNode : public HTMLNode { @@ -94,17 +94,17 @@ struct TagNode : public HTMLNode { std::vector> Attributes; // List of key-value attributes for tag - void Render(llvm::raw_ostream &OS, int IndentationLevel) override; + void render(llvm::raw_ostream &OS, int IndentationLevel) override; }; constexpr const char *kDoctypeDecl = ""; struct HTMLFile { std::vector> Children; // List of child nodes - void Render(llvm::raw_ostream &OS) { + void render(llvm::raw_ostream &OS) { OS << kDoctypeDecl << "\n"; for (const auto &C : Children) { - C->Render(OS, 0); + C->render(OS, 0); OS << "\n"; } } @@ -112,7 +112,7 @@ struct HTMLFile { } // namespace -bool HTMLTag::IsSelfClosing() const { +bool HTMLTag::isSelfClosing() const { switch (Value) { case HTMLTag::TAG_META: case HTMLTag::TAG_LINK: @@ -177,12 +177,12 @@ llvm::SmallString<16> HTMLTag::ToString() const { llvm_unreachable("Unhandled HTMLTag::TagType"); } -void TextNode::Render(llvm::raw_ostream &OS, int 
IndentationLevel) { +void TextNode::render(llvm::raw_ostream &OS, int IndentationLevel) { OS.indent(IndentationLevel * 2); printHTMLEscaped(Text, OS); } -void TagNode::Render(llvm::raw_ostream &OS, int IndentationLevel) { +void TagNode::render(llvm::raw_ostream &OS, int IndentationLevel) { // Children nodes are rendered in the same line if all of them are text nodes bool InlineChildren = true; for (const auto &C : Children) @@ -194,7 +194,7 @@ void TagNode::Render(llvm::raw_ostream &OS, int IndentationLevel) { OS << "<" << Tag.ToString(); for (const auto &A : Attributes) OS << " " << A.first << "=\"" << A.second << "\""; - if (Tag.IsSelfClosing()) { + if (Tag.isSelfClosing()) { OS << "/>"; return; } @@ -205,7 +205,7 @@ void TagNode::Render(llvm::raw_ostream &OS, int IndentationLevel) { for (const auto &C : Children) { int ChildrenIndentation = InlineChildren || !NewLineRendered ? 0 : IndentationLevel + 1; - C->Render(OS, ChildrenIndentation); + C->render(OS, ChildrenIndentation); if (!InlineChildren && (C == Children.back() || (C->Type != NodeType::NODE_TEXT || (&C + 1)->get()->Type != NodeType::NODE_TEXT))) { @@ -221,7 +221,7 @@ void TagNode::Render(llvm::raw_ostream &OS, int IndentationLevel) { template ::value>> -static void AppendVector(std::vector &&New, +static void appendVector(std::vector &&New, std::vector &Original) { std::move(New.begin(), New.end(), std::back_inserter(Original)); } @@ -289,9 +289,18 @@ genStylesheetsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) { static std::vector> genJsScriptsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) { std::vector> Out; + + // index_json.js is part of every generated HTML file + SmallString<128> IndexJSONPath = computeRelativePath("", InfoPath); + auto IndexJSONNode = std::make_unique(HTMLTag::TAG_SCRIPT); + llvm::sys::path::append(IndexJSONPath, "index_json.js"); + llvm::sys::path::native(IndexJSONPath, llvm::sys::path::Style::posix); + IndexJSONNode->Attributes.emplace_back("src", std::string(IndexJSONPath)); + Out.emplace_back(std::move(IndexJSONNode)); + for (const auto &FilePath : CDCtx.JsScripts) { - auto ScriptNode = std::make_unique(HTMLTag::TAG_SCRIPT); SmallString<128> ScriptPath = computeRelativePath("", InfoPath); + auto ScriptNode = std::make_unique(HTMLTag::TAG_SCRIPT); llvm::sys::path::append(ScriptPath, llvm::sys::path::filename(FilePath)); // Paths in HTML must be in posix-style llvm::sys::path::native(ScriptPath, llvm::sys::path::Style::posix); @@ -313,8 +322,7 @@ genReference(const Reference &Type, StringRef CurrentDirectory, if (Type.Path.empty()) { if (!JumpToSection) return std::make_unique(Type.Name); - else - return genLink(Type.Name, "#" + *JumpToSection); + return genLink(Type.Name, "#" + *JumpToSection); } llvm::SmallString<64> Path = Type.getRelativeFilePath(CurrentDirectory); llvm::sys::path::append(Path, Type.getFileBaseName() + ".html"); @@ -357,7 +365,7 @@ genEnumsBlock(const std::vector &Enums, auto &DivBody = Out.back(); for (const auto &E : Enums) { std::vector> Nodes = genHTML(E, CDCtx); - AppendVector(std::move(Nodes), DivBody->Children); + appendVector(std::move(Nodes), DivBody->Children); } return Out; } @@ -388,7 +396,7 @@ genFunctionsBlock(const std::vector &Functions, for (const auto &F : Functions) { std::vector> Nodes = genHTML(F, CDCtx, ParentInfoDir); - AppendVector(std::move(Nodes), DivBody->Children); + appendVector(std::move(Nodes), DivBody->Children); } return Out; } @@ -478,10 +486,10 @@ genFileHeadNodes(StringRef Title, StringRef InfoPath, 
Out.emplace_back(std::make_unique(HTMLTag::TAG_TITLE, Title)); std::vector> StylesheetsNodes = genStylesheetsHTML(InfoPath, CDCtx); - AppendVector(std::move(StylesheetsNodes), Out); + appendVector(std::move(StylesheetsNodes), Out); std::vector> JsNodes = genJsScriptsHTML(InfoPath, CDCtx); - AppendVector(std::move(JsNodes), Out); + appendVector(std::move(JsNodes), Out); return Out; } @@ -513,7 +521,7 @@ static std::unique_ptr genInfoFileMainNode( MainContentNode->Attributes.emplace_back("id", "main-content"); MainContentNode->Attributes.emplace_back( "class", "col-xs-12 col-sm-9 col-md-8 main-content"); - AppendVector(std::move(MainContentInnerNodes), MainContentNode->Children); + appendVector(std::move(MainContentInnerNodes), MainContentNode->Children); auto RightSidebarNode = std::make_unique(HTMLTag::TAG_DIV); RightSidebarNode->Attributes.emplace_back("id", "sidebar-right"); @@ -521,7 +529,7 @@ static std::unique_ptr genInfoFileMainNode( "class", "col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right"); std::vector> InfoIndexHTML = genHTML(InfoIndex, InfoPath, true); - AppendVector(std::move(InfoIndexHTML), RightSidebarNode->Children); + appendVector(std::move(InfoIndexHTML), RightSidebarNode->Children); MainNode->Children.emplace_back(std::move(LeftSidebarNode)); MainNode->Children.emplace_back(std::move(MainContentNode)); @@ -555,7 +563,7 @@ genInfoFile(StringRef Title, StringRef InfoPath, genInfoFileMainNode(InfoPath, MainContentNodes, InfoIndex); std::unique_ptr FooterNode = genFileFooterNode(); - AppendVector(std::move(HeadNodes), F.Children); + appendVector(std::move(HeadNodes), F.Children); F.Children.emplace_back(std::move(HeaderNode)); F.Children.emplace_back(std::move(MainNode)); F.Children.emplace_back(std::move(FooterNode)); @@ -594,7 +602,7 @@ genHTML(const Index &Index, StringRef InfoPath, bool IsOutermostList) { for (const auto &C : Index.Children) { auto LiBody = std::make_unique(HTMLTag::TAG_LI); std::vector> Nodes = genHTML(C, InfoPath, false); - AppendVector(std::move(Nodes), LiBody->Children); + appendVector(std::move(Nodes), LiBody->Children); UlBody->Children.emplace_back(std::move(LiBody)); } return Out; @@ -609,7 +617,9 @@ static std::unique_ptr genHTML(const CommentInfo &I) { FullComment->Children.emplace_back(std::move(Node)); } return std::move(FullComment); - } else if (I.Kind == "ParagraphComment") { + } + + if (I.Kind == "ParagraphComment") { auto ParagraphComment = std::make_unique(HTMLTag::TAG_P); for (const auto &Child : I.Children) { std::unique_ptr Node = genHTML(*Child); @@ -619,7 +629,9 @@ static std::unique_ptr genHTML(const CommentInfo &I) { if (ParagraphComment->Children.empty()) return nullptr; return std::move(ParagraphComment); - } else if (I.Kind == "TextComment") { + } + + if (I.Kind == "TextComment") { if (I.Text == "") return nullptr; return std::make_unique(I.Text); @@ -639,11 +651,7 @@ static std::unique_ptr genHTML(const std::vector &C) { static std::vector> genHTML(const EnumInfo &I, const ClangDocContext &CDCtx) { std::vector> Out; - std::string EnumType; - if (I.Scoped) - EnumType = "enum class "; - else - EnumType = "enum "; + std::string EnumType = I.Scoped ? 
"enum class " : "enum "; Out.emplace_back( std::make_unique(HTMLTag::TAG_H3, EnumType + I.Name)); @@ -737,17 +745,17 @@ genHTML(const NamespaceInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx, std::vector> ChildNamespaces = genReferencesBlock(I.Children.Namespaces, "Namespaces", BasePath); - AppendVector(std::move(ChildNamespaces), Out); + appendVector(std::move(ChildNamespaces), Out); std::vector> ChildRecords = genReferencesBlock(I.Children.Records, "Records", BasePath); - AppendVector(std::move(ChildRecords), Out); + appendVector(std::move(ChildRecords), Out); std::vector> ChildFunctions = genFunctionsBlock(I.Children.Functions, CDCtx, BasePath); - AppendVector(std::move(ChildFunctions), Out); + appendVector(std::move(ChildFunctions), Out); std::vector> ChildEnums = genEnumsBlock(I.Children.Enums, CDCtx); - AppendVector(std::move(ChildEnums), Out); + appendVector(std::move(ChildEnums), Out); if (!I.Children.Namespaces.empty()) InfoIndex.Children.emplace_back("Namespaces", "Namespaces"); @@ -791,29 +799,29 @@ genHTML(const RecordInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx, auto &PBody = Out.back(); PBody->Children.emplace_back(std::make_unique("Inherits from ")); if (Parents.empty()) - AppendVector(std::move(VParents), PBody->Children); + appendVector(std::move(VParents), PBody->Children); else if (VParents.empty()) - AppendVector(std::move(Parents), PBody->Children); + appendVector(std::move(Parents), PBody->Children); else { - AppendVector(std::move(Parents), PBody->Children); + appendVector(std::move(Parents), PBody->Children); PBody->Children.emplace_back(std::make_unique(", ")); - AppendVector(std::move(VParents), PBody->Children); + appendVector(std::move(VParents), PBody->Children); } } std::vector> Members = genRecordMembersBlock(I.Members, I.Path); - AppendVector(std::move(Members), Out); + appendVector(std::move(Members), Out); std::vector> ChildRecords = genReferencesBlock(I.Children.Records, "Records", I.Path); - AppendVector(std::move(ChildRecords), Out); + appendVector(std::move(ChildRecords), Out); std::vector> ChildFunctions = genFunctionsBlock(I.Children.Functions, CDCtx, I.Path); - AppendVector(std::move(ChildFunctions), Out); + appendVector(std::move(ChildFunctions), Out); std::vector> ChildEnums = genEnumsBlock(I.Children.Enums, CDCtx); - AppendVector(std::move(ChildEnums), Out); + appendVector(std::move(ChildEnums), Out); if (!I.Members.empty()) InfoIndex.Children.emplace_back("Members", "Members"); @@ -936,7 +944,7 @@ llvm::Error HTMLGenerator::generateDocForInfo(Info *I, llvm::raw_ostream &OS, HTMLFile F = genInfoFile(InfoTitle, I->getRelativeFilePath(""), MainContentNodes, InfoIndex, CDCtx); - F.Render(OS); + F.render(OS); return llvm::Error::success(); } @@ -959,7 +967,7 @@ static std::string getRefType(InfoType IT) { llvm_unreachable("Unknown InfoType"); } -static llvm::Error SerializeIndex(ClangDocContext &CDCtx) { +static llvm::Error serializeIndex(ClangDocContext &CDCtx) { std::error_code OK; std::error_code FileErr; llvm::SmallString<128> FilePath; @@ -985,9 +993,9 @@ static llvm::Error SerializeIndex(ClangDocContext &CDCtx) { }); }); }; - OS << "var JsonIndex = `\n"; + OS << "async function LoadIndex() {\nreturn"; IndexToJSON(CDCtx.Idx); - OS << "`;\n"; + OS << ";\n}"; return llvm::Error::success(); } @@ -1009,7 +1017,7 @@ static std::unique_ptr genIndexFileMainNode() { return MainNode; } -static llvm::Error GenIndex(const ClangDocContext &CDCtx) { +static llvm::Error genIndex(const ClangDocContext &CDCtx) { std::error_code FileErr, OK; 
llvm::SmallString<128> IndexPath; llvm::sys::path::native(CDCtx.OutDirectory, IndexPath); @@ -1029,17 +1037,17 @@ static llvm::Error GenIndex(const ClangDocContext &CDCtx) { std::unique_ptr MainNode = genIndexFileMainNode(); std::unique_ptr FooterNode = genFileFooterNode(); - AppendVector(std::move(HeadNodes), F.Children); + appendVector(std::move(HeadNodes), F.Children); F.Children.emplace_back(std::move(HeaderNode)); F.Children.emplace_back(std::move(MainNode)); F.Children.emplace_back(std::move(FooterNode)); - F.Render(IndexOS); + F.render(IndexOS); return llvm::Error::success(); } -static llvm::Error CopyFile(StringRef FilePath, StringRef OutDirectory) { +static llvm::Error copyFile(StringRef FilePath, StringRef OutDirectory) { llvm::SmallString<128> PathWrite; llvm::sys::path::native(OutDirectory, PathWrite); llvm::sys::path::append(PathWrite, llvm::sys::path::filename(FilePath)); @@ -1057,20 +1065,20 @@ static llvm::Error CopyFile(StringRef FilePath, StringRef OutDirectory) { } llvm::Error HTMLGenerator::createResources(ClangDocContext &CDCtx) { - auto Err = SerializeIndex(CDCtx); + auto Err = serializeIndex(CDCtx); if (Err) return Err; - Err = GenIndex(CDCtx); + Err = genIndex(CDCtx); if (Err) return Err; for (const auto &FilePath : CDCtx.UserStylesheets) { - Err = CopyFile(FilePath, CDCtx.OutDirectory); + Err = copyFile(FilePath, CDCtx.OutDirectory); if (Err) return Err; } - for (const auto &FilePath : CDCtx.FilesToCopy) { - Err = CopyFile(FilePath, CDCtx.OutDirectory); + for (const auto &FilePath : CDCtx.JsScripts) { + Err = copyFile(FilePath, CDCtx.OutDirectory); if (Err) return Err; } diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp index 2afff2929cf79c..d08afbb9621890 100644 --- a/clang-tools-extra/clang-doc/Representation.cpp +++ b/clang-tools-extra/clang-doc/Representation.cpp @@ -368,11 +368,9 @@ ClangDocContext::ClangDocContext(tooling::ExecutionContext *ECtx, StringRef ProjectName, bool PublicOnly, StringRef OutDirectory, StringRef SourceRoot, StringRef RepositoryUrl, - std::vector UserStylesheets, - std::vector JsScripts) + std::vector UserStylesheets) : ECtx(ECtx), ProjectName(ProjectName), PublicOnly(PublicOnly), - OutDirectory(OutDirectory), UserStylesheets(UserStylesheets), - JsScripts(JsScripts) { + OutDirectory(OutDirectory), UserStylesheets(UserStylesheets) { llvm::SmallString<128> SourceRootDir(SourceRoot); if (SourceRoot.empty()) // If no SourceRoot was provided the current path is used as the default diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h index a6b144eb7fa2a2..d70c279f7a2bdc 100644 --- a/clang-tools-extra/clang-doc/Representation.h +++ b/clang-tools-extra/clang-doc/Representation.h @@ -482,8 +482,7 @@ struct ClangDocContext { ClangDocContext(tooling::ExecutionContext *ECtx, StringRef ProjectName, bool PublicOnly, StringRef OutDirectory, StringRef SourceRoot, StringRef RepositoryUrl, - std::vector UserStylesheets, - std::vector JsScripts); + std::vector UserStylesheets); tooling::ExecutionContext *ECtx; std::string ProjectName; // Name of project clang-doc is documenting. bool PublicOnly; // Indicates if only public declarations are documented. @@ -498,8 +497,6 @@ struct ClangDocContext { std::vector UserStylesheets; // JavaScript files that will be imported in allHTML file. std::vector JsScripts; - // Other files that should be copied to OutDirectory, besides UserStylesheets. 
- std::vector FilesToCopy; Index Idx; }; diff --git a/clang-tools-extra/clang-doc/assets/index.js b/clang-tools-extra/clang-doc/assets/index.js index a5ac90f2e06e78..49818763a43935 100644 --- a/clang-tools-extra/clang-doc/assets/index.js +++ b/clang-tools-extra/clang-doc/assets/index.js @@ -80,8 +80,8 @@ function createIndex(Index) { // Runs after DOM loads document.addEventListener("DOMContentLoaded", function() { - // JsonIndex is a variable from another file that contains the index - // in JSON format - var Index = JSON.parse(JsonIndex); - createIndex(Index); + // LoadIndex is an asynchronous function that will be generated by clang-doc. + // It ensures that the function call will not block as soon as the page loads, + // since the index object is often huge and can contain thousands of lines. + LoadIndex().then((Index) => { createIndex(Index); }); }); diff --git a/clang-tools-extra/clang-doc/tool/CMakeLists.txt b/clang-tools-extra/clang-doc/tool/CMakeLists.txt index 4944251245c6bc..e93a5728d6b6b0 100644 --- a/clang-tools-extra/clang-doc/tool/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/tool/CMakeLists.txt @@ -25,7 +25,7 @@ set(assets ) set(asset_dir "${CMAKE_CURRENT_SOURCE_DIR}/../assets") -set(resource_dir "${CMAKE_BINARY_DIR}/share/clang") +set(resource_dir "${CMAKE_BINARY_DIR}/share/clang-doc") set(out_files) function(copy_files_to_dst src_dir dst_dir file) @@ -42,7 +42,7 @@ endfunction(copy_files_to_dst) foreach(f ${assets}) install(FILES ${asset_dir}/${f} - DESTINATION "${CMAKE_INSTALL_DATADIR}/clang" + DESTINATION "${CMAKE_INSTALL_DATADIR}/clang-doc" COMPONENT clang-doc) copy_files_to_dst(${asset_dir} ${resource_dir} ${f}) endforeach(f) diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp index 5517522d7967dc..6198a6e0cdcc32 100644 --- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp +++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp @@ -81,6 +81,12 @@ static llvm::cl::list UserStylesheets( llvm::cl::desc("CSS stylesheets to extend the default styles."), llvm::cl::cat(ClangDocCategory)); +static llvm::cl::opt UserAssetPath( + "asset", + llvm::cl::desc("User supplied asset path to " + "override the default css and js files for html output"), + llvm::cl::cat(ClangDocCategory)); + static llvm::cl::opt SourceRoot("source-root", llvm::cl::desc(R"( Directory where processed files are stored. Links to definition locations will only be @@ -127,16 +133,84 @@ std::string getFormatString() { // GetMainExecutable (since some platforms don't support taking the // address of main, and some platforms can't implement GetMainExecutable // without being given the address of a function in the main executable).
-std::string GetExecutablePath(const char *Argv0, void *MainAddr) { +std::string getExecutablePath(const char *Argv0, void *MainAddr) { return llvm::sys::fs::getMainExecutable(Argv0, MainAddr); } +llvm::Error getAssetFiles(clang::doc::ClangDocContext &CDCtx) { + using DirIt = llvm::sys::fs::directory_iterator; + std::error_code FileErr; + llvm::SmallString<128> FilePath(UserAssetPath); + for (DirIt DirStart = DirIt(UserAssetPath, FileErr), + DirEnd; + !FileErr && DirStart != DirEnd; DirStart.increment(FileErr)) { + FilePath = DirStart->path(); + if (llvm::sys::fs::is_regular_file(FilePath)) { + if (llvm::sys::path::extension(FilePath) == ".css") + CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(), + std::string(FilePath)); + else if (llvm::sys::path::extension(FilePath) == ".js") + CDCtx.JsScripts.emplace_back(FilePath.str()); + } + } + if (FileErr) + return llvm::createFileError(FilePath, FileErr); + return llvm::Error::success(); +} + +llvm::Error getDefaultAssetFiles(const char *Argv0, + clang::doc::ClangDocContext &CDCtx) { + void *MainAddr = (void *)(intptr_t)getExecutablePath; + std::string ClangDocPath = getExecutablePath(Argv0, MainAddr); + llvm::SmallString<128> NativeClangDocPath; + llvm::sys::path::native(ClangDocPath, NativeClangDocPath); + + llvm::SmallString<128> AssetsPath; + AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath); + llvm::sys::path::append(AssetsPath, "..", "share", "clang-doc"); + llvm::SmallString<128> DefaultStylesheet; + llvm::sys::path::native(AssetsPath, DefaultStylesheet); + llvm::sys::path::append(DefaultStylesheet, + "clang-doc-default-stylesheet.css"); + llvm::SmallString<128> IndexJS; + llvm::sys::path::native(AssetsPath, IndexJS); + llvm::sys::path::append(IndexJS, "index.js"); + + if (!llvm::sys::fs::is_regular_file(IndexJS)) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "default index.js file missing at " + + IndexJS + "\n"); + + if (!llvm::sys::fs::is_regular_file(DefaultStylesheet)) + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "default clang-doc-default-stylesheet.css file missing at " + + DefaultStylesheet + "\n"); + + CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(), + std::string(DefaultStylesheet)); + CDCtx.JsScripts.emplace_back(IndexJS.str()); + + return llvm::Error::success(); +} + +llvm::Error getHtmlAssetFiles(const char *Argv0, + clang::doc::ClangDocContext &CDCtx) { + if (!UserAssetPath.empty() && + !llvm::sys::fs::is_directory(std::string(UserAssetPath))) + llvm::outs() << "Asset path supply is not a directory: " << UserAssetPath + << " falling back to default\n"; + if (llvm::sys::fs::is_directory(std::string(UserAssetPath))) + return getAssetFiles(CDCtx); + return getDefaultAssetFiles(Argv0, CDCtx); +} + int main(int argc, const char **argv) { llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); std::error_code OK; const char *Overview = - R"(Generates documentation from source code and comments. + R"(Generates documentation from source code and comments. 
Example usage for files without flags (default): @@ -178,27 +252,14 @@ Example usage for a project using a compile commands database: OutDirectory, SourceRoot, RepositoryUrl, - {UserStylesheets.begin(), UserStylesheets.end()}, - {"index.js", "index_json.js"}}; + {UserStylesheets.begin(), UserStylesheets.end()} + }; if (Format == "html") { - void *MainAddr = (void *)(intptr_t)GetExecutablePath; - std::string ClangDocPath = GetExecutablePath(argv[0], MainAddr); - llvm::SmallString<128> NativeClangDocPath; - llvm::sys::path::native(ClangDocPath, NativeClangDocPath); - llvm::SmallString<128> AssetsPath; - AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath); - llvm::sys::path::append(AssetsPath, "..", "share", "clang"); - llvm::SmallString<128> DefaultStylesheet; - llvm::sys::path::native(AssetsPath, DefaultStylesheet); - llvm::sys::path::append(DefaultStylesheet, - "clang-doc-default-stylesheet.css"); - llvm::SmallString<128> IndexJS; - llvm::sys::path::native(AssetsPath, IndexJS); - llvm::sys::path::append(IndexJS, "index.js"); - CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(), - std::string(DefaultStylesheet)); - CDCtx.FilesToCopy.emplace_back(IndexJS.str()); + if (auto Err = getHtmlAssetFiles(argv[0], CDCtx)) { + llvm::errs() << toString(std::move(Err)) << "\n"; + return 1; + } } // Mapping phase diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp index 70d0281df28fad..44ccc2bc906a5b 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp @@ -18,6 +18,26 @@ using namespace clang::ast_matchers; +namespace clang::tidy { + +template <> +struct OptionEnumMapping { + static llvm::ArrayRef< + std::pair> + getEnumMapping() { + static constexpr std::pair + Mapping[] = { + {misc::UseInternalLinkageCheck::FixModeKind::None, "None"}, + {misc::UseInternalLinkageCheck::FixModeKind::UseStatic, + "UseStatic"}, + }; + return {Mapping}; + } +}; + +} // namespace clang::tidy + namespace clang::tidy::misc { namespace { @@ -57,6 +77,16 @@ AST_POLYMORPHIC_MATCHER(isExternStorageClass, } // namespace +UseInternalLinkageCheck::UseInternalLinkageCheck(StringRef Name, + ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + HeaderFileExtensions(Context->getHeaderFileExtensions()), + FixMode(Options.get("FixMode", FixModeKind::UseStatic)) {} + +void UseInternalLinkageCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "FixMode", FixMode); +} + void UseInternalLinkageCheck::registerMatchers(MatchFinder *Finder) { auto Common = allOf(isFirstDecl(), isAllRedeclsInMainFile(HeaderFileExtensions), @@ -82,11 +112,21 @@ static constexpr StringRef Message = void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *FD = Result.Nodes.getNodeAs("fn")) { - diag(FD->getLocation(), Message) << "function" << FD; + DiagnosticBuilder DB = diag(FD->getLocation(), Message) << "function" << FD; + SourceLocation FixLoc = FD->getTypeSpecStartLoc(); + if (FixLoc.isInvalid() || FixLoc.isMacroID()) + return; + if (FixMode == FixModeKind::UseStatic) + DB << FixItHint::CreateInsertion(FixLoc, "static "); return; } if (const auto *VD = Result.Nodes.getNodeAs("var")) { - diag(VD->getLocation(), Message) << "variable" << VD; + DiagnosticBuilder DB = diag(VD->getLocation(), Message) << "variable" << VD; + SourceLocation FixLoc = VD->getTypeSpecStartLoc(); + if (FixLoc.isInvalid() || 
FixLoc.isMacroID()) + return; + if (FixMode == FixModeKind::UseStatic) + DB << FixItHint::CreateInsertion(FixLoc, "static "); return; } llvm_unreachable(""); diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h index a3c1c339659036..0d6c3e43aa9457 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h @@ -20,17 +20,22 @@ namespace clang::tidy::misc { /// http://clang.llvm.org/extra/clang-tidy/checks/misc/use-internal-linkage.html class UseInternalLinkageCheck : public ClangTidyCheck { public: - UseInternalLinkageCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context), - HeaderFileExtensions(Context->getHeaderFileExtensions()) {} + UseInternalLinkageCheck(StringRef Name, ClangTidyContext *Context); void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; std::optional getCheckTraversalKind() const override { return TK_IgnoreUnlessSpelledInSource; } + enum class FixModeKind { + None, + UseStatic, + }; + private: FileExtensionsSet HeaderFileExtensions; + FixModeKind FixMode; }; } // namespace clang::tidy::misc diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp index fd4730d9c8b9c8..499c88ef5d4e49 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp @@ -9,6 +9,7 @@ #include "SimplifyBooleanExprCheck.h" #include "clang/AST/Expr.h" #include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/DiagnosticIDs.h" #include "clang/Lex/Lexer.h" #include "llvm/Support/SaveAndRestore.h" @@ -702,7 +703,8 @@ bool SimplifyBooleanExprCheck::canBeBypassed(const Stmt *S) const { return IgnoreMacros && S->getBeginLoc().isMacroID(); } -void SimplifyBooleanExprCheck::issueDiag(const ASTContext &Context, +/// @brief return true when replacement created. 
+bool SimplifyBooleanExprCheck::issueDiag(const ASTContext &Context, SourceLocation Loc, StringRef Description, SourceRange ReplacementRange, @@ -712,8 +714,10 @@ void SimplifyBooleanExprCheck::issueDiag(const ASTContext &Context, Context.getSourceManager(), getLangOpts()); DiagnosticBuilder Diag = diag(Loc, Description); - if (!containsDiscardedTokens(Context, CharRange)) + const bool HasReplacement = !containsDiscardedTokens(Context, CharRange); + if (HasReplacement) Diag << FixItHint::CreateReplacement(CharRange, Replacement); + return HasReplacement; } void SimplifyBooleanExprCheck::replaceWithThenStatement( @@ -751,8 +755,18 @@ void SimplifyBooleanExprCheck::replaceWithReturnCondition( replacementExpression(Context, Negated, If->getCond()); std::string Replacement = ("return " + Condition + Terminator).str(); SourceLocation Start = BoolLiteral->getBeginLoc(); - issueDiag(Context, Start, SimplifyConditionalReturnDiagnostic, - If->getSourceRange(), Replacement); + + const bool HasReplacement = + issueDiag(Context, Start, SimplifyConditionalReturnDiagnostic, + If->getSourceRange(), Replacement); + + if (!HasReplacement) { + const SourceRange ConditionRange = If->getCond()->getSourceRange(); + if (ConditionRange.isValid()) + diag(ConditionRange.getBegin(), "conditions that can be simplified", + DiagnosticIDs::Note) + << ConditionRange; + } } void SimplifyBooleanExprCheck::replaceCompoundReturnWithCondition( @@ -760,9 +774,23 @@ void SimplifyBooleanExprCheck::replaceCompoundReturnWithCondition( const IfStmt *If, const Expr *ThenReturn) { const std::string Replacement = "return " + replacementExpression(Context, Negated, If->getCond()); - issueDiag(Context, ThenReturn->getBeginLoc(), - SimplifyConditionalReturnDiagnostic, - SourceRange(If->getBeginLoc(), Ret->getEndLoc()), Replacement); + + const bool HasReplacement = issueDiag( + Context, ThenReturn->getBeginLoc(), SimplifyConditionalReturnDiagnostic, + SourceRange(If->getBeginLoc(), Ret->getEndLoc()), Replacement); + + if (!HasReplacement) { + const SourceRange ConditionRange = If->getCond()->getSourceRange(); + if (ConditionRange.isValid()) + diag(ConditionRange.getBegin(), "conditions that can be simplified", + DiagnosticIDs::Note) + << ConditionRange; + const SourceRange ReturnRange = Ret->getSourceRange(); + if (ReturnRange.isValid()) + diag(ReturnRange.getBegin(), "return statement that can be simplified", + DiagnosticIDs::Note) + << ReturnRange; + } } void SimplifyBooleanExprCheck::replaceWithAssignment(const ASTContext &Context, diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h index 63c3caa01e01a7..2ea69687984086 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h @@ -60,7 +60,7 @@ class SimplifyBooleanExprCheck : public ClangTidyCheck { const BinaryOperator *Inner, bool TryOfferFix, const Stmt *Parent, const ParenExpr *Parens); - void issueDiag(const ASTContext &Context, SourceLocation Loc, + bool issueDiag(const ASTContext &Context, SourceLocation Loc, StringRef Description, SourceRange ReplacementRange, StringRef Replacement); diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index 7388f20ef288eb..d42dafa8ffc362 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -325,6 +325,14 @@ option 
is recognized. )"), cl::init(false), cl::cat(ClangTidyCategory)); +static cl::opt AllowNoChecks("allow-no-checks", desc(R"( +Allow empty enabled checks. This suppresses +the "no checks enabled" error when disabling +all of the checks. +)"), + cl::init(false), + cl::cat(ClangTidyCategory)); + namespace clang::tidy { static void printStats(const ClangTidyStats &Stats) { @@ -598,7 +606,7 @@ int clangTidyMain(int argc, const char **argv) { } if (ListChecks) { - if (EnabledChecks.empty()) { + if (EnabledChecks.empty() && !AllowNoChecks) { llvm::errs() << "No checks enabled.\n"; return 1; } @@ -652,6 +660,10 @@ int clangTidyMain(int argc, const char **argv) { } if (EnabledChecks.empty()) { + if (AllowNoChecks) { + llvm::outs() << "No checks enabled.\n"; + return 0; + } llvm::errs() << "Error: no checks enabled.\n"; llvm::cl::PrintHelpMessage(/*Hidden=*/false, /*Categorized=*/true); return 1; diff --git a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py index b048460abf2fca..62cb4297c50f75 100755 --- a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py +++ b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py @@ -229,6 +229,11 @@ def main(): default=[], help="Load the specified plugin in clang-tidy.", ) + parser.add_argument( + "-allow-no-checks", + action="store_true", + help="Allow empty enabled checks.", + ) clang_tidy_args = [] argv = sys.argv[1:] @@ -327,6 +332,8 @@ def main(): common_clang_tidy_args.append("-p=%s" % args.build_path) if args.use_color: common_clang_tidy_args.append("--use-color") + if args.allow_no_checks: + common_clang_tidy_args.append("--allow-no-checks") for arg in args.extra_arg: common_clang_tidy_args.append("-extra-arg=%s" % arg) for arg in args.extra_arg_before: diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index 4dd20bec81d3be..0dc35ad5873629 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -107,6 +107,7 @@ def get_tidy_invocation( plugins, warnings_as_errors, exclude_header_filter, + allow_no_checks, ): """Gets a command line for clang-tidy.""" start = [clang_tidy_binary] @@ -147,6 +148,8 @@ def get_tidy_invocation( start.append("-load=" + plugin) if warnings_as_errors: start.append("--warnings-as-errors=" + warnings_as_errors) + if allow_no_checks: + start.append("--allow-no-checks") start.append(f) return start @@ -232,6 +235,7 @@ def run_tidy(args, clang_tidy_binary, tmpdir, build_path, queue, lock, failed_fi args.plugins, args.warnings_as_errors, args.exclude_header_filter, + args.allow_no_checks, ) proc = subprocess.Popen( @@ -261,20 +265,20 @@ def main(): parser.add_argument( "-allow-enabling-alpha-checkers", action="store_true", - help="allow alpha checkers from clang-analyzer.", + help="Allow alpha checkers from clang-analyzer.", ) parser.add_argument( - "-clang-tidy-binary", metavar="PATH", help="path to clang-tidy binary" + "-clang-tidy-binary", metavar="PATH", help="Path to clang-tidy binary." 
) parser.add_argument( "-clang-apply-replacements-binary", metavar="PATH", - help="path to clang-apply-replacements binary", + help="Path to clang-apply-replacements binary.", ) parser.add_argument( "-checks", default=None, - help="checks filter, when not specified, use clang-tidy default", + help="Checks filter, when not specified, use clang-tidy default.", ) config_group = parser.add_mutually_exclusive_group() config_group.add_argument( @@ -307,7 +311,7 @@ def main(): parser.add_argument( "-header-filter", default=None, - help="regular expression matching the names of the " + help="Regular expression matching the names of the " "headers to output diagnostics from. Diagnostics from " "the main file of each translation unit are always " "displayed.", @@ -347,19 +351,22 @@ def main(): "-j", type=int, default=0, - help="number of tidy instances to be run in parallel.", + help="Number of tidy instances to be run in parallel.", ) parser.add_argument( - "files", nargs="*", default=[".*"], help="files to be processed (regex on path)" + "files", + nargs="*", + default=[".*"], + help="Files to be processed (regex on path).", ) - parser.add_argument("-fix", action="store_true", help="apply fix-its") + parser.add_argument("-fix", action="store_true", help="apply fix-its.") parser.add_argument( - "-format", action="store_true", help="Reformat code after applying fixes" + "-format", action="store_true", help="Reformat code after applying fixes." ) parser.add_argument( "-style", default="file", - help="The style of reformat code after applying fixes", + help="The style of reformat code after applying fixes.", ) parser.add_argument( "-use-color", @@ -388,7 +395,7 @@ def main(): help="Additional argument to prepend to the compiler command line.", ) parser.add_argument( - "-quiet", action="store_true", help="Run clang-tidy in quiet mode" + "-quiet", action="store_true", help="Run clang-tidy in quiet mode." ) parser.add_argument( "-load", @@ -400,7 +407,12 @@ def main(): parser.add_argument( "-warnings-as-errors", default=None, - help="Upgrades warnings to errors. Same format as '-checks'", + help="Upgrades warnings to errors. Same format as '-checks'.", + ) + parser.add_argument( + "-allow-no-checks", + action="store_true", + help="Allow empty enabled checks.", ) args = parser.parse_args() @@ -463,6 +475,7 @@ def main(): args.plugins, args.warnings_as_errors, args.exclude_header_filter, + args.allow_no_checks, ) invocation.append("-list-checks") invocation.append("-") diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index a6062ccf42230b..106feb7fb41720 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -296,7 +296,7 @@ AST_MATCHER_P(DeclRefExpr, doesNotMutateObject, int, Indirections) { return false; } const auto *const Method = - dyn_cast(OpCall->getDirectCallee()); + dyn_cast_or_null(OpCall->getDirectCallee()); if (Method == nullptr) { // This is not a member operator. Typically, a friend operator. These diff --git a/clang-tools-extra/clangd/Format.cpp b/clang-tools-extra/clangd/Format.cpp index 272a34d4ed7972..fc56a1c8c50304 100644 --- a/clang-tools-extra/clangd/Format.cpp +++ b/clang-tools-extra/clangd/Format.cpp @@ -281,7 +281,7 @@ formatIncremental(llvm::StringRef OriginalCode, unsigned OriginalCursor, // Never *remove* lines in response to pressing enter! This annoys users. 
if (InsertedText == "\n") { Style.MaxEmptyLinesToKeep = 1000; - Style.KeepEmptyLinesAtTheStartOfBlocks = true; + Style.KeepEmptyLines.AtStartOfBlock = true; } // Compute the code we want to format: diff --git a/clang-tools-extra/clangd/index/remote/Client.h b/clang-tools-extra/clangd/index/remote/Client.h index ff5348bd23279e..9004012f066ae2 100644 --- a/clang-tools-extra/clangd/index/remote/Client.h +++ b/clang-tools-extra/clangd/index/remote/Client.h @@ -12,8 +12,11 @@ #include "index/Index.h" #include "llvm/ADT/StringRef.h" +#include + namespace clang { namespace clangd { + namespace remote { /// Returns an SymbolIndex client that passes requests to remote index located diff --git a/clang-tools-extra/clangd/support/Trace.cpp b/clang-tools-extra/clangd/support/Trace.cpp index 419c2eee99ec84..7c6490f2e92fb2 100644 --- a/clang-tools-extra/clangd/support/Trace.cpp +++ b/clang-tools-extra/clangd/support/Trace.cpp @@ -159,8 +159,8 @@ class JSONTracer : public EventTracer { Out.object([&] { Out.attribute("pid", 0); Out.attribute("ph", Phase); - for (const auto &KV : Event) - Out.attribute(KV.first, KV.second); + for (const auto *KV : llvm::json::sortedElements(Event)) + Out.attribute(KV->first, KV->second); }); } diff --git a/clang-tools-extra/clangd/test/trace.test b/clang-tools-extra/clangd/test/trace.test index cb6a226b496a4b..56f2d6333f1de0 100644 --- a/clang-tools-extra/clangd/test/trace.test +++ b/clang-tools-extra/clangd/test/trace.test @@ -8,17 +8,17 @@ # CHECK: "traceEvents": [ # CHECK: { # CHECK: "ph": "X", -# CHECK: "name": "BuildPreamble", # CHECK: "args": { # CHECK: "File": "{{.*(/|\\)}}foo.c" # CHECK: }, +# CHECK: "name": "BuildPreamble", # CHECK: } # CHECK: { # CHECK: "ph": "X", -# CHECK: "name": "BuildAST", # CHECK: "args": { # CHECK: "File": "{{.*(/|\\)}}foo.c" # CHECK: }, +# CHECK: "name": "BuildAST", # CHECK: } # CHECK: ] # CHECK: } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 16fdc20eafd62e..d968100cd57a76 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -125,6 +125,9 @@ Improvements to clang-tidy - Added argument `--exclude-header-filter` and config option `ExcludeHeaderFilterRegex` to exclude headers from analysis via a RegEx. +- Added argument `--allow-no-checks` to suppress "no checks enabled" error + when disabling all of the checks by `--checks='-*'`. + New checks ^^^^^^^^^^ @@ -398,7 +401,7 @@ Changes in existing checks analyzed, so the check now handles the common patterns `const auto e = (*vector_ptr)[i]` and `const auto e = vector_ptr->at(i);`. Calls to mutable function where there exists a `const` overload are also - handled. + handled. Fix crash in the case of a non-member operator call. - Improved :doc:`readability-avoid-return-with-void-value ` check by adding @@ -452,7 +455,8 @@ Changes in existing checks - Improved :doc:`readability-simplify-boolean-expr ` check to avoid to emit - warning for macro when IgnoreMacro option is enabled. + warning for macro when IgnoreMacro option is enabled and improve messages + when auto-fix does not work. 
- Improved :doc:`readability-static-definition-in-anonymous-namespace ` diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst index e8e43a1fb3d632..7147af9a7919bc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst @@ -25,3 +25,16 @@ Example: } // already declared as extern extern int v2; + +Options +------- + +.. option:: FixMode + + Selects what kind of a fix the check should provide. The default is `UseStatic`. + + ``None`` + Don't fix automatically. + + ``UseStatic`` + Add ``static`` for internal linkage variable and function. diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index 9ccacefa3c2c59..c8fc34c61caeb5 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -240,6 +240,9 @@ An overview of all the command-line options: This option's value is appended to the value of the 'WarningsAsErrors' option in .clang-tidy file, if any. + --allow-no-checks - Allow empty enabled checks. This suppresses + the "no checks enabled" error when disabling + all of the checks. -p is used to read a compile command database. diff --git a/clang-tools-extra/test/clang-doc/Inputs/basic-project/src/Calculator.cpp b/clang-tools-extra/test/clang-doc/Inputs/basic-project/src/Calculator.cpp index df1778c3b9d55f..64f31dbf13d871 100644 --- a/clang-tools-extra/test/clang-doc/Inputs/basic-project/src/Calculator.cpp +++ b/clang-tools-extra/test/clang-doc/Inputs/basic-project/src/Calculator.cpp @@ -1,5 +1,4 @@ #include "Calculator.h" -#include int Calculator::add(int a, int b) { return a + b; @@ -14,8 +13,5 @@ int Calculator::multiply(int a, int b) { } double Calculator::divide(int a, int b) { - if (b == 0) { - throw std::invalid_argument("Division by zero"); - } return static_cast(a) / b; } diff --git a/clang-tools-extra/test/clang-doc/Inputs/test-assets/test.css b/clang-tools-extra/test/clang-doc/Inputs/test-assets/test.css new file mode 100644 index 00000000000000..5c5532659be5e0 --- /dev/null +++ b/clang-tools-extra/test/clang-doc/Inputs/test-assets/test.css @@ -0,0 +1,3 @@ +body { + padding: 0; +} \ No newline at end of file diff --git a/clang-tools-extra/test/clang-doc/Inputs/test-assets/test.js b/clang-tools-extra/test/clang-doc/Inputs/test-assets/test.js new file mode 100644 index 00000000000000..06f320c59b9b0f --- /dev/null +++ b/clang-tools-extra/test/clang-doc/Inputs/test-assets/test.js @@ -0,0 +1 @@ +console.log("Hello, world!"); \ No newline at end of file diff --git a/clang-tools-extra/test/clang-doc/assets.cpp b/clang-tools-extra/test/clang-doc/assets.cpp new file mode 100644 index 00000000000000..d5a2d20e92240b --- /dev/null +++ b/clang-tools-extra/test/clang-doc/assets.cpp @@ -0,0 +1,22 @@ +// RUN: rm -rf %t && mkdir %t +// RUN: clang-doc --format=html --output=%t --asset=%S/Inputs/test-assets --executor=standalone %s +// RUN: FileCheck %s -input-file=%t/index.html -check-prefix=INDEX +// RUN: FileCheck %s -input-file=%t/test.css -check-prefix=CSS +// RUN: FileCheck %s -input-file=%t/test.js -check-prefix=JS + +// INDEX: +// INDEX-NEXT: +// INDEX-NEXT: Index +// INDEX-NEXT: +// INDEX-NEXT: +// INDEX-NEXT: +// INDEX-NEXT:
+// INDEX-NEXT:
+// INDEX-NEXT: +// INDEX-NEXT:
+ +// CSS: body { +// CSS-NEXT: padding: 0; +// CSS-NEXT: } + +// JS: console.log("Hello, world!"); \ No newline at end of file diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test index 0898acaea3a33e..bab5f8e1761bc6 100644 --- a/clang-tools-extra/test/clang-doc/basic-project.test +++ b/clang-tools-extra/test/clang-doc/basic-project.test @@ -7,8 +7,8 @@ // RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefix=HTML-RECTANGLE // RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefix=HTML-CIRCLE -// JSON-INDEX: var JsonIndex = ` -// JSON-INDEX-NEXT: { +// JSON-INDEX: async function LoadIndex() { +// JSON-INDEX-NEXT: return{ // JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", // JSON-INDEX-NEXT: "Name": "", // JSON-INDEX-NEXT: "RefType": "default", @@ -51,14 +51,15 @@ // JSON-INDEX-NEXT: ] // JSON-INDEX-NEXT: } // JSON-INDEX-NEXT: ] -// JSON-INDEX-NEXT: }`; +// JSON-INDEX-NEXT: }; +// JSON-INDEX-NEXT: } // HTML-SHAPE: // HTML-SHAPE-NEXT: // HTML-SHAPE-NEXT: class Shape // HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT: +// HTML-SHAPE-NEXT: +// HTML-SHAPE-NEXT: // HTML-SHAPE-NEXT:
// HTML-SHAPE-NEXT:
// HTML-SHAPE-NEXT: @@ -122,8 +123,8 @@ // HTML-CALC-NEXT: // HTML-CALC-NEXT: class Calculator // HTML-CALC-NEXT: -// HTML-CALC-NEXT: // HTML-CALC-NEXT: +// HTML-CALC-NEXT: // HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT: @@ -139,25 +140,25 @@ // HTML-CALC-NEXT:
// HTML-CALC-NEXT:

add

// HTML-CALC-NEXT:

public int add(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 4 of file {{.*}}Calculator.cpp

+// HTML-CALC-NEXT:

Defined at line 3 of file {{.*}}Calculator.cpp

// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT:

subtract

// HTML-CALC-NEXT:

public int subtract(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 8 of file {{.*}}Calculator.cpp

+// HTML-CALC-NEXT:

Defined at line 7 of file {{.*}}Calculator.cpp

// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT:

multiply

// HTML-CALC-NEXT:

public int multiply(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 12 of file {{.*}}Calculator.cpp

+// HTML-CALC-NEXT:

Defined at line 11 of file {{.*}}Calculator.cpp

// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT:

divide

// HTML-CALC-NEXT:

public double divide(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 16 of file {{.*}}Calculator.cpp

+// HTML-CALC-NEXT:

Defined at line 15 of file {{.*}}Calculator.cpp

// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
// HTML-CALC-NEXT:
@@ -200,8 +201,8 @@ // HTML-RECTANGLE-NEXT: // HTML-RECTANGLE-NEXT: class Rectangle // HTML-RECTANGLE-NEXT: -// HTML-RECTANGLE-NEXT: // HTML-RECTANGLE-NEXT: +// HTML-RECTANGLE-NEXT: // HTML-RECTANGLE-NEXT:
// HTML-RECTANGLE-NEXT:
// HTML-RECTANGLE-NEXT: @@ -281,8 +282,8 @@ // HTML-CIRCLE-NEXT: // HTML-CIRCLE-NEXT: class Circle // HTML-CIRCLE-NEXT: -// HTML-CIRCLE-NEXT: // HTML-CIRCLE-NEXT: +// HTML-CIRCLE-NEXT: // HTML-CIRCLE-NEXT:
// HTML-CIRCLE-NEXT:
// HTML-CIRCLE-NEXT: diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-fix-mode-none.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-fix-mode-none.cpp new file mode 100644 index 00000000000000..3f2f5897bf7183 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-fix-mode-none.cpp @@ -0,0 +1,10 @@ +// RUN: %check_clang_tidy %s misc-use-internal-linkage %t -- \ +// RUN: -config="{CheckOptions: {misc-use-internal-linkage.FixMode: 'None'}}" -- -I%S/Inputs/use-internal-linkage + +void func() {} +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func' +// CHECK-FIXES-NOT: static void func() {} + +int global; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'global' +// CHECK-FIXES-NOT: static int global; diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp index c6c513fe0b0c06..9c91389542b03d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp @@ -1,16 +1,21 @@ // RUN: %check_clang_tidy %s misc-use-internal-linkage %t -- -- -I%S/Inputs/use-internal-linkage +// RUN: %check_clang_tidy %s misc-use-internal-linkage %t -- \ +// RUN: -config="{CheckOptions: {misc-use-internal-linkage.FixMode: 'UseStatic'}}" -- -I%S/Inputs/use-internal-linkage #include "func.h" void func() {} // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func' +// CHECK-FIXES: static void func() {} template void func_template() {} // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_template' +// CHECK-FIXES: static void func_template() {} void func_cpp_inc(); // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc' +// CHECK-FIXES: static void func_cpp_inc(); #include "func_cpp.inc" diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp index bd5ef5431de6cc..01b8d28e612301 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp @@ -1,13 +1,17 @@ // RUN: %check_clang_tidy %s misc-use-internal-linkage %t -- -- -I%S/Inputs/use-internal-linkage +// RUN: %check_clang_tidy %s misc-use-internal-linkage %t -- \ +// RUN: -config="{CheckOptions: {misc-use-internal-linkage.FixMode: 'UseStatic'}}" -- -I%S/Inputs/use-internal-linkage #include "var.h" int global; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'global' +// CHECK-FIXES: static int global; template T global_template; // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: variable 'global_template' +// CHECK-FIXES: static T global_template; int gloabl_header; diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp index f259552dc8f1d8..d02bb98cf583cb 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp @@ -906,3 +906,12 @@ void negativeNonConstMemberExpr() { } } + +bool operator==(ExpensiveToCopyType, ExpensiveToCopyType); + +template bool OperatorWithNoDirectCallee(T t) { + ExpensiveToCopyType a1; + 
ExpensiveToCopyType a2 = a1; + return a1 == t; +} + diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp index c14438aa938019..bad1055a019045 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp @@ -353,6 +353,23 @@ bool conditional_return_statements(int i) { // CHECK-FIXES: {{^}} return i == 0;{{$}} // CHECK-FIXES-NEXT: {{^}$}} +bool conditional_return_statements_no_fix_1(int i) { + if (i == 0) return true; + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: {{.*}} in conditional return statement + // CHECK-MESSAGES: :[[@LINE-2]]:7: note: conditions that can be simplified + // comment + return false; + // CHECK-MESSAGES: :[[@LINE-1]]:3: note: return statement that can be simplified +} + +bool conditional_return_statements_no_fix_2(int i) { + if (i == 0) return true; + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: {{.*}} in conditional return statement + // CHECK-MESSAGES: :[[@LINE-2]]:7: note: conditions that can be simplified + // comment + else return false; +} + bool conditional_return_statements_then_expr(int i, int j) { if (i == j) return (i == 0); else return false; } diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/allow-no-checks.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/allow-no-checks.cpp new file mode 100644 index 00000000000000..a1f059b92384d0 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/allow-no-checks.cpp @@ -0,0 +1,4 @@ +// RUN: not clang-tidy %s -checks='-*' +// RUN: clang-tidy %s -checks='-*' --allow-no-checks | FileCheck --match-full-lines %s + +// CHECK: No checks enabled. diff --git a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp index 9aabb1ed30e426..e4a7340318b934 100644 --- a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp @@ -30,7 +30,7 @@ ClangDocContext getClangDocContext(std::vector UserStylesheets = {}, StringRef RepositoryUrl = "") { ClangDocContext CDCtx{ - {}, "test-project", {}, {}, {}, RepositoryUrl, UserStylesheets, {}}; + {}, "test-project", {}, {}, {}, RepositoryUrl, UserStylesheets}; CDCtx.UserStylesheets.insert( CDCtx.UserStylesheets.begin(), "../share/clang/clang-doc-default-stylesheet.css"); @@ -66,6 +66,7 @@ TEST(HTMLGeneratorTest, emitNamespaceHTML) { namespace Namespace +
test-project
@@ -176,6 +177,7 @@ TEST(HTMLGeneratorTest, emitRecordHTML) { class r +
test-project
@@ -290,6 +292,7 @@ TEST(HTMLGeneratorTest, emitFunctionHTML) { +
test-project
@@ -337,6 +340,7 @@ TEST(HTMLGeneratorTest, emitEnumHTML) { +
test-project
@@ -422,6 +426,7 @@ TEST(HTMLGeneratorTest, emitCommentHTML) { +
test-project
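The clang-doc changes above replace the hard-coded stylesheet and index.js handling with asset discovery: the default assets are now looked up under ../share/clang-doc relative to the clang-doc binary, and a user-supplied directory can override them through the new --asset option. A usage sketch (directory and file names are illustrative, patterned on the assets.cpp test added in this patch):

  clang-doc --format=html --output=docs --asset=./my-assets --executor=standalone MyLib.cpp

Any *.css file found in the asset directory is inserted at the front of the user stylesheets and any *.js file is added to the scripts copied next to the generated HTML; if the directory is missing or invalid, the tool falls back to the installed defaults.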
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index a573ec54732101..9892b5d58e719b 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -321,7 +321,7 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m-unknown-eabi) set(RUNTIMES_${target}_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") set(RUNTIMES_${target}_CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY CACHE STRING "") foreach(lang C;CXX;ASM) - set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -mthumb" CACHE STRING "") + set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -mthumb -Wno-atomic-alignment" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index bb00c20922d361..080cba90c4a8bf 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -4443,23 +4443,51 @@ the configuration (without a prefix: ``Auto``). false: import {VeryLongImportsAreAnnoying, VeryLongImportsAreAnnoying, VeryLongImportsAreAnnoying,} from "some/module.js" +.. _KeepEmptyLines: + +**KeepEmptyLines** (``KeepEmptyLinesStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` + Which empty lines are kept. See ``MaxEmptyLinesToKeep`` for how many + consecutive empty lines are kept. + + Nested configuration flags: + + Options regarding which empty lines are kept. + + For example, the config below will remove empty lines at start of the + file, end of the file, and start of blocks. + + + .. code-block:: c++ + + KeepEmptyLines: + AtEndOfFile: false + AtStartOfBlock: false + AtStartOfFile: false + + * ``bool AtEndOfFile`` Keep empty lines at end of file. + + * ``bool AtStartOfBlock`` Keep empty lines at start of a block. + + .. code-block:: c++ + + true: false: + if (foo) { vs. if (foo) { + bar(); + bar(); } + } + + * ``bool AtStartOfFile`` Keep empty lines at start of file. + + .. _KeepEmptyLinesAtEOF: **KeepEmptyLinesAtEOF** (``Boolean``) :versionbadge:`clang-format 17` :ref:`¶ ` - Keep empty lines (up to ``MaxEmptyLinesToKeep``) at end of file. + This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``. .. _KeepEmptyLinesAtTheStartOfBlocks: **KeepEmptyLinesAtTheStartOfBlocks** (``Boolean``) :versionbadge:`clang-format 3.7` :ref:`¶ ` - If true, the empty line at the start of blocks is kept. - - .. code-block:: c++ - - true: false: - if (foo) { vs. if (foo) { - bar(); - bar(); } - } + This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``. .. _LambdaBodyIndentation: diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst index 5ba84c2f670556..e26297c7af97ac 100644 --- a/clang/docs/HIPSupport.rst +++ b/clang/docs/HIPSupport.rst @@ -284,3 +284,48 @@ Example Usage Base* basePtr = &obj; basePtr->virtualFunction(); // Allowed since obj is constructed in device code } + +SPIR-V Support on HIPAMD ToolChain +================================== + +The HIPAMD ToolChain supports targetting +`AMDGCN Flavoured SPIR-V `_. +The support for SPIR-V in the ROCm and HIPAMD ToolChain is under active +development. 
+ +Compilation Process +------------------- + +When compiling HIP programs with the intent of utilizing SPIR-V, the process +diverges from the traditional compilation flow: + +Using ``--offload-arch=amdgcnspirv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- **Target Triple**: The ``--offload-arch=amdgcnspirv`` flag instructs the + compiler to use the target triple ``spirv64-amd-amdhsa``. This approach + generates generic AMDGCN SPIR-V which retains architecture specific elements + without hardcoding them, thus allowing for optimal target specific code to be + generated at run time, when the concrete target is known. + +- **LLVM IR Translation**: The program is compiled to LLVM Intermediate + Representation (IR), which is subsequently translated into SPIR-V. In the + future, this translation step will be replaced by direct SPIR-V emission via + the SPIR-V Back-end. + +- **Clang Offload Bundler**: The resulting SPIR-V is embedded in the Clang + offload bundler with the bundle ID ``hip-spirv64-amd-amdhsa--amdgcnspirv``. + +Mixed with Normal ``--offload-arch`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Mixing ``amdgcnspirv`` and concrete ``gfx###`` targets via ``--offload-arch`` +is not currently supported; this limitation is temporary and will be removed in +a future release.** + +Architecture Specific Macros +---------------------------- + +None of the architecture specific :doc:`AMDGPU macros ` are +defined when targeting SPIR-V. An alternative, more flexible mechanism to enable +doing per target / per feature code selection will be added in the future. diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index df80588b3fe035..d9439d49a2e299 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3483,6 +3483,60 @@ Query for this feature with ``__has_builtin(__builtin_trap)``. ``__builtin_arm_trap`` is lowered to the ``llvm.aarch64.break`` builtin, and then to ``brk #payload``. +``__builtin_verbose_trap`` +-------------------------- + +``__builtin_verbose_trap`` causes the program to stop its execution abnormally +and shows a human-readable description of the reason for the termination when a +debugger is attached or in a symbolicated crash log. + +**Syntax**: + +.. code-block:: c++ + + __builtin_verbose_trap(const char *category, const char *reason) + +**Description** + +``__builtin_verbose_trap`` is lowered to the ` ``llvm.trap`` `_ builtin. +Additionally, clang emits debugging information that represents an artificial +inline frame whose name encodes the category and reason strings passed to the builtin, +prefixed by a "magic" prefix. + +For example, consider the following code: + +.. code-block:: c++ + + void foo(int* p) { + if (p == nullptr) + __builtin_verbose_trap("check null", "Argument must not be null!"); + } + +The debugging information would look as if it were produced for the following code: + +.. code-block:: c++ + + __attribute__((always_inline)) + inline void "__clang_trap_msg$check null$Argument must not be null!"() { + __builtin_trap(); + } + + void foo(int* p) { + if (p == nullptr) + "__clang_trap_msg$check null$Argument must not be null!"(); + } + +However, the generated code would not actually contain a call to the artificial +function — it only exists in the debugging information. + +Query for this feature with ``__has_builtin(__builtin_verbose_trap)``. Note that +users need to enable debug information to enable this feature.
A call to this +builtin is equivalent to a call to ``__builtin_trap`` if debug information isn't +enabled. + +The optimizer can merge calls to trap with different messages, which degrades +the debugging experience. + ``__builtin_allow_runtime_check`` --------------------------------- @@ -5345,7 +5399,7 @@ The ``#pragma clang section`` directive obeys the following rules: * The pragma clang section is enabled automatically, without need of any flags. -* This feature is only defined to work sensibly for ELF targets. +* This feature is only defined to work sensibly for ELF and Mach-O targets. * If section name is specified through _attribute_((section("myname"))), then the attribute name gains precedence. @@ -5368,16 +5422,85 @@ The ``#pragma comment(lib, ...)`` directive is supported on all ELF targets. The second parameter is the library name (without the traditional Unix prefix of ``lib``). This allows you to provide an implicit link of dependent libraries. -Evaluating Object Size Dynamically -================================== +Evaluating Object Size +====================== + +Clang supports the builtins ``__builtin_object_size`` and +``__builtin_dynamic_object_size``. The semantics are compatible with GCC's +builtins of the same names, but the details are slightly different. + +.. code-block:: c -Clang supports the builtin ``__builtin_dynamic_object_size``, the semantics are -the same as GCC's ``__builtin_object_size`` (which Clang also supports), but -``__builtin_dynamic_object_size`` can evaluate the object's size at runtime. -``__builtin_dynamic_object_size`` is meant to be used as a drop-in replacement -for ``__builtin_object_size`` in libraries that support it. + size_t __builtin_[dynamic_]object_size(const void *ptr, int type) + +Returns the number of accessible bytes ``n`` past ``ptr``. The value returned +depends on ``type``, which is required to be an integer constant between 0 and +3: + +* If ``type & 2 == 0``, the least ``n`` is returned such that accesses to + ``(const char*)ptr + n`` and beyond are known to be out of bounds. This is + ``(size_t)-1`` if no better bound is known. +* If ``type & 2 == 2``, the greatest ``n`` is returned such that accesses to + ``(const char*)ptr + i`` are known to be in bounds, for 0 <= ``i`` < ``n``. + This is ``(size_t)0`` if no better bound is known. + +.. code-block:: c + + char small[10], large[100]; + bool cond; + // Returns 100: writes of more than 100 bytes are known to be out of bounds. + int n100 = __builtin_object_size(cond ? small : large, 0); + // Returns 10: writes of 10 or fewer bytes are known to be in bounds. + int n10 = __builtin_object_size(cond ? small : large, 2); + +* If ``type & 1 == 0``, pointers are considered to be in bounds if they point + into the same storage as ``ptr`` -- that is, the same stack object, global + variable, or heap allocation. +* If ``type & 1 == 1``, pointers are considered to be in bounds if they point + to the same subobject that ``ptr`` points to. If ``ptr`` points to an array + element, other elements of the same array, but not of enclosing arrays, are + considered in bounds. + +.. code-block:: c + + struct X { char a, b, c; } x; + static_assert(__builtin_object_size(&x, 0) == 3); + static_assert(__builtin_object_size(&x.b, 0) == 2); + static_assert(__builtin_object_size(&x.b, 1) == 1); + +.. 
code-block:: c -For instance, here is a program that ``__builtin_dynamic_object_size`` will make + char a[10][10][10]; + static_assert(__builtin_object_size(&a, 1) == 1000); + static_assert(__builtin_object_size(&a[1], 1) == 900); + static_assert(__builtin_object_size(&a[1][1], 1) == 90); + static_assert(__builtin_object_size(&a[1][1][1], 1) == 9); + +The values returned by this builtin are a best effort conservative approximation +of the correct answers. When ``type & 2 == 0``, the true value is less than or +equal to the value returned by the builtin, and when ``type & 2 == 1``, the true +value is greater than or equal to the value returned by the builtin. + +For ``__builtin_object_size``, the value is determined entirely at compile time. +With optimization enabled, better results will be produced, especially when the +call to ``__builtin_object_size`` is in a different function from the formation +of the pointer. Unlike in GCC, enabling optimization in Clang does not allow +more information about subobjects to be determined, so the ``type & 1 == 1`` +case will often give imprecise results when used across a function call boundary +even when optimization is enabled. + +`The pass_object_size and pass_dynamic_object_size attributes `_ +can be used to invisibly pass the object size for a pointer parameter alongside +the pointer in a function call. This allows more precise object sizes to be +determined both when building without optimizations and in the ``type & 1 == 1`` +case. + +For ``__builtin_dynamic_object_size``, the result is not limited to being a +compile time constant. Instead, a small amount of runtime evaluation is +permitted to determine the size of the object, in order to give a more precise +result. ``__builtin_dynamic_object_size`` is meant to be used as a drop-in +replacement for ``__builtin_object_size`` in libraries that support it. For +instance, here is a program that ``__builtin_dynamic_object_size`` will make safer: .. code-block:: c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9c8f8c4a4fbafe..c720e47dbe35b7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -71,6 +71,11 @@ C++ Specific Potentially Breaking Changes To fix this, update libstdc++ to version 14.1.1 or greater. +- Clang now emits errors when Thread Safety Analysis trylock attributes are + applied to functions or methods with incompatible return values, such as + constructors, destructors, and void-returning functions. This only affects the + ``TRY_ACQUIRE`` and ``TRY_ACQUIRE_SHARED`` attributes (and any synonyms). + ABI Changes in This Version --------------------------- - Fixed Microsoft name mangling of implicitly defined variables used for thread @@ -95,17 +100,22 @@ ABI Changes in This Version - Fixed Microsoft calling convention when returning classes that have a deleted copy assignment operator. Such a class should be returned indirectly. +- Removed the global alias that was pointing to AArch64 Function Multiversioning + ifuncs. Its purpose was to preserve backwards compatibility when the ".ifunc" + suffix got removed from the name mangling. The alias interacts badly with + GlobalOpt (see the issue #96197). + - Fixed Microsoft name mangling for auto non-type template arguments of pointer type for MSVC 1920+. 
This change resolves incompatibilities with code compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by earlier versions of Clang unless such code is built with the compiler option `-fms-compatibility-version=19.14` to imitate the MSVC 1914 mangling behavior. - AST Dumping Potentially Breaking Changes ---------------------------------------- - The text ast-dumper has improved printing of TemplateArguments. +- The text decl-dumper prints template parameters' trailing requires expressions now. Clang Frontend Potentially Breaking Changes ------------------------------------------- @@ -511,6 +521,11 @@ Attribute Changes in Clang }; +- Introduced new function type attributes ``[[clang::nonblocking]]``, ``[[clang::nonallocating]]``, + ``[[clang::blocking]]``, and ``[[clang::allocating]]``, with GNU-style variants as well. + The attributes declare constraints about a function's behavior pertaining to blocking and + heap memory allocation. + Improvements to Clang's diagnostics ----------------------------------- - Clang now applies syntax highlighting to the code snippets it @@ -720,11 +735,19 @@ Bug Fixes in This Version - Fixed `static_cast` to array of unknown bound. Fixes (#GH62863). +- Clang's Thread Safety Analysis now evaluates trylock success arguments of enum + types rather than silently defaulting to false. This fixes a class of false + negatives where the analysis failed to detect unchecked access to guarded + data. + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Fix crash when atomic builtins are called with pointer to zero-size struct (#GH90330) +- Clang now allows pointee types of atomic builtin arguments to be complete template types + that was not instantiated elsewhere. + Bug Fixes to Attribute Support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -915,6 +938,7 @@ Bug Fixes to C++ Support - Fix an assertion failure caused by parsing a lambda used as a default argument for the value of a forward-declared class. (#GH93512). - Fixed a bug in access checking inside return-type-requirement of compound requirements. (#GH93788). +- Fixed an assertion failure about invalid conversion when calling lambda. (#GH96205). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1091,6 +1115,8 @@ clang-format - Adds ``AllowShortCaseExpressionOnASingleLine`` option. - Adds ``AlignCaseArrows`` suboption to ``AlignConsecutiveShortCaseStatements``. - Adds ``LeftWithLastLine`` suboption to ``AlignEscapedNewlines``. +- Adds ``KeepEmptyLines`` option to deprecate ``KeepEmptyLinesAtEOF`` + and ``KeepEmptyLinesAtTheStartOfBlocks``. libclang -------- diff --git a/clang/docs/ThreadSafetyAnalysis.rst b/clang/docs/ThreadSafetyAnalysis.rst index dcde0c706c7045..1513719caa464b 100644 --- a/clang/docs/ThreadSafetyAnalysis.rst +++ b/clang/docs/ThreadSafetyAnalysis.rst @@ -420,10 +420,17 @@ TRY_ACQUIRE(, ...), TRY_ACQUIRE_SHARED(, ...) *Previously:* ``EXCLUSIVE_TRYLOCK_FUNCTION``, ``SHARED_TRYLOCK_FUNCTION`` These are attributes on a function or method that tries to acquire the given -capability, and returns a boolean value indicating success or failure. -The first argument must be ``true`` or ``false``, to specify which return value -indicates success, and the remaining arguments are interpreted in the same way -as ``ACQUIRE``. See :ref:`mutexheader`, below, for example uses. +capability, and returns a boolean, integer, or pointer value indicating success +or failure. + +The attribute's first argument defines whether a zero or non-zero return value +indicates success. 
Syntactically, it accepts ``NULL`` or ``nullptr``, ``bool``
+and ``int`` literals, as well as enumerator values. *The analysis only cares
+whether this success value is zero or non-zero.* This leads to some subtle
+consequences, discussed in the next section.
+
+The remaining arguments are interpreted in the same way as ``ACQUIRE``. See
+:ref:`mutexheader`, below, for example uses.
Because the analysis doesn't support conditional locking, a capability is treated as acquired after the first branch on the return value of a try-acquire function. } }
+Subtle Consequences of Non-Boolean Success Values
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The trylock attributes accept non-boolean expressions for the success value, but
+the analysis only cares whether the value is zero or non-zero.
+
+Suppose you define an enum with two non-zero enumerators: ``LockAcquired = 1``
+and ``LockNotAcquired = 2``. If your trylock function returns ``LockAcquired``
+on success and ``LockNotAcquired`` on failure, the analysis may fail to detect
+access to guarded data without holding the mutex because they are both non-zero.
+
+.. code-block:: c++
+
+ // *** Beware: this code demonstrates incorrect usage. ***
+
+ enum TrylockResult { LockAcquired = 1, LockNotAcquired = 2 };
+
+ class CAPABILITY("mutex") Mutex {
+ public:
+ TrylockResult TryLock() TRY_ACQUIRE(LockAcquired);
+ };
+
+ Mutex mu;
+ int a GUARDED_BY(mu);
+
+ void foo() {
+ if (mu.TryLock()) { // This branch satisfies the analysis, but permits
+ // unguarded access to `a`!
+ a = 0;
+ mu.Unlock();
+ }
+ }
+
+It's also possible to return a pointer from the trylock function. Similarly, all
+that matters is whether the success value is zero or non-zero. For instance, a
+success value of ``true`` means the function returns a non-null pointer on
+success.
+
ASSERT_CAPABILITY(...) and ASSERT_SHARED_CAPABILITY(...) --------------------------------------------------------
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index b8d5f372bdf619..42c097d973d532 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -420,21 +420,56 @@ around, such as ``std::string_view``.
cplusplus.Move (C++) """"""""""""""""""""
-Method calls on a moved-from object and copying a moved-from object will be reported.
-
+Find use-after-move bugs in C++. This includes method calls on moved-from
+objects, assignment of a moved-from object, and repeated move of a moved-from
+object.
.. code-block:: cpp
- struct A {
+ struct A { void foo() {} };
- void f() {
+ void f1() {
A a; A b = std::move(a); // note: 'a' became 'moved-from' here a.foo(); // warn: method call on a 'moved-from' object 'a' }
+ void f2() {
+ A a;
+ A b = std::move(a);
+ A c(std::move(a)); // warn: move of an already moved-from object
+ }
+
+ void f3() {
+ A a;
+ A b = std::move(a);
+ b = a; // warn: copy of moved-from object
+ }
+
+The checker option ``WarnOn`` controls for which objects use-after-move is
+checked:
+
+* The strictest value is ``KnownsOnly``; in this mode, only objects whose type
+ is known to be move-unsafe are checked. These include most STL objects
+ (excluding move-safe ones) and smart pointers.
+* With option value ``KnownsAndLocals``, local variables (of any type) are
+ additionally checked. The idea behind this is that local variables are
+ rarely meant to be reused, so a use after move is more likely a bug
+ than with member variables.
+* With option value ``All``, any use-after-move condition is checked on all
+ kinds of variables, excluding global variables and known move-safe cases.
+
+The default value is ``KnownsAndLocals``.
+
+Calls of methods named ``empty()`` or ``isEmpty()`` are allowed on moved-from
+objects because these methods are considered move-safe. Functions called
+``reset()``, ``destroy()``, ``clear()``, ``assign``, ``resize``, ``shrink`` are
+treated as state-reset functions and are allowed on moved-from objects; these
+make the object valid again. This applies to any type of object (not only STL
+ones).
+
.. _cplusplus-NewDelete: cplusplus.NewDelete (C++)
diff --git a/clang/examples/LLVMPrintFunctionNames/LLVMPrintFunctionNames.cpp b/clang/examples/LLVMPrintFunctionNames/LLVMPrintFunctionNames.cpp
index 6cb87e4b5e19c4..79a315d5a4ee27 100644
--- a/clang/examples/LLVMPrintFunctionNames/LLVMPrintFunctionNames.cpp
+++ b/clang/examples/LLVMPrintFunctionNames/LLVMPrintFunctionNames.cpp
@@ -18,6 +18,7 @@ #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendPluginRegistry.h" #include "clang/Sema/Sema.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h" #include "llvm/Passes/OptimizationLevel.h" #include "llvm/Passes/PassBuilder.h"
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index de86cb5e9d7fcf..a99f2dc6eb3f2b 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -37,6 +37,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/TinyPtrVector.h" #include "llvm/Support/TypeSize.h" #include
@@ -467,6 +468,14 @@ class ASTContext : public RefCountedBase { /// This is the top-level (C++20) Named module we are building. Module *CurrentCXXNamedModule = nullptr;
+ /// Helper structures to decide whether two `const Module *` belong
+ /// to the same conceptual module, to avoid the expensive string comparison
+ /// if possible.
+ ///
+ /// Not serialized intentionally.
+ llvm::StringMap PrimaryModuleNameMap;
+ llvm::DenseMap SameModuleLookupSet;
+
static constexpr unsigned ConstantArrayTypesLog2InitSize = 8; static constexpr unsigned GeneralTypesLog2InitSize = 9; static constexpr unsigned FunctionProtoTypesLog2InitSize = 12;
@@ -1073,6 +1082,12 @@ class ASTContext : public RefCountedBase { /// Get module under construction, nullptr if this is not a C++20 module. Module *getCurrentNamedModule() const { return CurrentCXXNamedModule; }
+ /// Whether the two modules \p M1 and \p M2 are in the same module.
+ ///
+ /// FIXME: The signature may be confusing since `clang::Module` refers to
+ /// a module fragment or a module unit but not a C++20 module.
+ bool isInSameModule(const Module *M1, const Module *M2);
+
TranslationUnitDecl *getTranslationUnitDecl() const { return TUDecl->getMostRecentDecl(); }
@@ -1263,6 +1278,11 @@ class ASTContext : public RefCountedBase { /// space. QualType removeAddrSpaceQualType(QualType T) const;
+ /// Return the "other" discriminator used for the pointer auth schema used for
+ /// vtable pointers in instances of the requested type.
+ uint16_t
+ getPointerAuthVTablePointerDiscriminator(const CXXRecordDecl *RD);
+
/// Apply Objective-C protocol qualifiers to the given type. /// \param allowOnPointerType specifies if we can apply protocol /// qualifiers on ObjCObjectPointerType.
It can be set to true when @@ -3424,12 +3444,21 @@ OPT_LIST(V) /// Whether a C++ static variable or CUDA/HIP kernel should be externalized. bool shouldExternalize(const Decl *D) const; + /// Resolve the root record to be used to derive the vtable pointer + /// authentication policy for the specified record. + const CXXRecordDecl * + baseForVTableAuthentication(const CXXRecordDecl *ThisClass); + bool useAbbreviatedThunkName(GlobalDecl VirtualMethodDecl, + StringRef MangledName); + StringRef getCUIDHash() const; private: /// All OMPTraitInfo objects live in this collection, one per /// `pragma omp [begin] declare variant` directive. SmallVector, 4> OMPTraitInfoVector; + + llvm::DenseMap> ThunksToBeAbbreviated; }; /// Insertion operator for diagnostics. diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h index ab036f1d445acc..4b627c65e276bb 100644 --- a/clang/include/clang/AST/AbstractBasicReader.h +++ b/clang/include/clang/AST/AbstractBasicReader.h @@ -244,6 +244,15 @@ class DataStreamBasicReader : public BasicReaderBase { return FunctionProtoType::ExtParameterInfo::getFromOpaqueValue(value); } + FunctionEffect readFunctionEffect() { + uint32_t value = asImpl().readUInt32(); + return FunctionEffect::fromOpaqueInt32(value); + } + + EffectConditionExpr readEffectConditionExpr() { + return EffectConditionExpr{asImpl().readExprRef()}; + } + NestedNameSpecifier *readNestedNameSpecifier() { auto &ctx = getASTContext(); diff --git a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h index 8e42fcaad1d388..b941add8bde889 100644 --- a/clang/include/clang/AST/AbstractBasicWriter.h +++ b/clang/include/clang/AST/AbstractBasicWriter.h @@ -222,6 +222,14 @@ class DataStreamBasicWriter : public BasicWriterBase { asImpl().writeUInt32(epi.getOpaqueValue()); } + void writeFunctionEffect(FunctionEffect E) { + asImpl().writeUInt32(E.toOpaqueInt32()); + } + + void writeEffectConditionExpr(EffectConditionExpr CE) { + asImpl().writeExprRef(CE.getCondition()); + } + void writeNestedNameSpecifier(NestedNameSpecifier *NNS) { // Nested name specifiers usually aren't too long. I think that 8 would // typically accommodate the vast majority. diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 7fd80b90d10337..5957f14098363e 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -3042,6 +3042,16 @@ class FunctionDecl : public DeclaratorDecl, /// computed and stored. unsigned getODRHash() const; + FunctionEffectsRef getFunctionEffects() const { + // Effects may differ between declarations, but they should be propagated + // from old to new on any redeclaration, so it suffices to look at + // getMostRecentDecl(). + if (const auto *FPT = + getMostRecentDecl()->getType()->getAs()) + return FPT->getFunctionEffects(); + return {}; + } + // Implement isa/cast/dyncast/etc. static bool classof(const Decl *D) { return classofKind(D->getKind()); } static bool classofKind(Kind K) { @@ -4670,6 +4680,13 @@ class BlockDecl : public Decl, public DeclContext { SourceRange getSourceRange() const override LLVM_READONLY; + FunctionEffectsRef getFunctionEffects() const { + if (const TypeSourceInfo *TSI = getSignatureAsWritten()) + if (const auto *FPT = TSI->getType()->getAs()) + return FPT->getFunctionEffects(); + return {}; + } + // Implement isa/cast/dyncast/etc. 
static bool classof(const Decl *D) { return classofKind(D->getKind()); } static bool classofKind(Kind K) { return K == Block; } diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 3310f57acc6835..45dac82e540776 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -835,10 +835,7 @@ class alignas(8) Decl { /// Get the module that owns this declaration for linkage purposes. /// There only ever is such a standard C++ module. - /// - /// \param IgnoreLinkage Ignore the linkage of the entity; assume that - /// all declarations in a global module fragment are unowned. - Module *getOwningModuleForLinkage(bool IgnoreLinkage = false) const; + Module *getOwningModuleForLinkage() const; /// Determine whether this declaration is definitely visible to name lookup, /// independent of whether the owning module is visible. diff --git a/clang/include/clang/AST/DeclID.h b/clang/include/clang/AST/DeclID.h index e8f4860e13f1f8..e5e27389fac60d 100644 --- a/clang/include/clang/AST/DeclID.h +++ b/clang/include/clang/AST/DeclID.h @@ -197,6 +197,8 @@ class LocalDeclID : public DeclIDBase { static LocalDeclID get(ASTReader &Reader, serialization::ModuleFile &MF, DeclID ID); + static LocalDeclID get(ASTReader &Reader, serialization::ModuleFile &MF, + unsigned ModuleFileIndex, unsigned LocalDeclID); LocalDeclID &operator++() { ++ID; @@ -259,11 +261,7 @@ template <> struct DenseMapInfo { } static unsigned getHashValue(const GlobalDeclID &Key) { - // Our default hash algorithm for 64 bits integer may not be very good. - // In GlobalDeclID's case, it is pretty common that the lower 32 bits can - // be same. - // FIXME: Remove this when we fix the underlying issue. - return llvm::hash_value(Key.getRawValue()); + return DenseMapInfo::getHashValue(Key.getRawValue()); } static bool isEqual(const GlobalDeclID &L, const GlobalDeclID &R) { diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 3bc8cae4d8c86c..a8add9d1337c6b 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -384,7 +384,7 @@ class Expr : public ValueStmt { bool isRValue() const { return Kind >= CL_XValue; } bool isModifiable() const { return getModifiable() == CM_Modifiable; } - /// Create a simple, modifiably lvalue + /// Create a simple, modifiable lvalue static Classification makeSimpleLValue() { return Classification(CL_LValue, CM_Modifiable); } @@ -787,6 +787,11 @@ class Expr : public ValueStmt { const Expr *PtrExpression, ASTContext &Ctx, EvalResult &Status) const; + /// If the current Expr can be evaluated to a pointer to a null-terminated + /// constant string, return the constant string (without the terminating + /// null). + std::optional tryEvaluateString(ASTContext &Ctx) const; + /// Enumeration used to describe the kind of Null pointer constant /// returned from \c isNullPointerConstant(). 
enum NullPointerConstantKind { @@ -1287,7 +1292,7 @@ class DeclRefExpr final DeclRefExpr(const ASTContext &Ctx, NestedNameSpecifierLoc QualifierLoc, SourceLocation TemplateKWLoc, ValueDecl *D, - bool RefersToEnlosingVariableOrCapture, + bool RefersToEnclosingVariableOrCapture, const DeclarationNameInfo &NameInfo, NamedDecl *FoundD, const TemplateArgumentListInfo *TemplateArgs, QualType T, ExprValueKind VK, NonOdrUseReason NOUR); @@ -1653,14 +1658,14 @@ class FloatingLiteral : public Expr, private APFloatStorage { } /// Get a raw enumeration value representing the floating-point semantics of - /// this literal (32-bit IEEE, x87, ...), suitable for serialisation. + /// this literal (32-bit IEEE, x87, ...), suitable for serialization. llvm::APFloatBase::Semantics getRawSemantics() const { return static_cast( FloatingLiteralBits.Semantics); } /// Set the raw enumeration value representing the floating-point semantics of - /// this literal (32-bit IEEE, x87, ...), suitable for serialisation. + /// this literal (32-bit IEEE, x87, ...), suitable for serialization. void setRawSemantics(llvm::APFloatBase::Semantics Sem) { FloatingLiteralBits.Semantics = Sem; } @@ -2125,7 +2130,7 @@ class SYCLUniqueStableNameExpr final : public Expr { static std::string ComputeName(ASTContext &Context, QualType Ty); }; -/// ParenExpr - This represents a parethesized expression, e.g. "(1)". This +/// ParenExpr - This represents a parenthesized expression, e.g. "(1)". This /// AST node is only formed if full location information is requested. class ParenExpr : public Expr { SourceLocation L, R; @@ -2241,7 +2246,7 @@ class UnaryOperator final bool canOverflow() const { return UnaryOperatorBits.CanOverflow; } void setCanOverflow(bool C) { UnaryOperatorBits.CanOverflow = C; } - /// Get the FP contractability status of this operator. Only meaningful for + /// Get the FP contractibility status of this operator. Only meaningful for /// operations on floating point types. bool isFPContractableWithinStatement(const LangOptions &LO) const { return getFPFeaturesInEffect(LO).allowFPContractWithinStatement(); @@ -4049,7 +4054,7 @@ class BinaryOperator : public Expr { return FPOptionsOverride(); } - /// Get the FP contractability status of this operator. Only meaningful for + /// Get the FP contractibility status of this operator. Only meaningful for /// operations on floating point types. bool isFPContractableWithinStatement(const LangOptions &LO) const { return getFPFeaturesInEffect(LO).allowFPContractWithinStatement(); @@ -4290,7 +4295,7 @@ class BinaryConditionalOperator : public AbstractConditionalOperator { } /// getFalseExpr - Return the subexpression which will be - /// evaluated if the condnition evaluates to false; this is + /// evaluated if the condition evaluates to false; this is /// defined in terms of the opaque value. Expr *getFalseExpr() const { return cast(SubExprs[RHS]); @@ -5998,7 +6003,7 @@ class GenericSelectionExpr final // if *It1 and *It2 are bound to the same objects. // An alternative design approach was discussed during review; // store an Association object inside the iterator, and return a reference - // to it when dereferenced. This idea was discarded beacuse of nasty + // to it when dereferenced. This idea was discarded because of nasty // lifetime issues: // AssociationIterator It = ...; // const Association &Assoc = *It++; // Oops, Assoc is dangling. 
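The dangling-reference hazard described in the AssociationIterator comment above is easy to reproduce outside of Clang. Below is a minimal, self-contained sketch (the ``ProxyIterator`` type is hypothetical and not part of this patch) of why an iterator that stores the dereferenced object inside itself dangles under post-increment:

.. code-block:: c++

  #include <cstddef>
  #include <iostream>
  #include <vector>

  // Mirrors the rejected design: operator* hands out a reference to storage
  // that lives inside the iterator object itself.
  struct ProxyIterator {
    const std::vector<int> *Vec;
    std::size_t Idx;
    int Current;

    const int &operator*() {
      Current = (*Vec)[Idx];
      return Current; // reference into *this*, not into the container
    }

    ProxyIterator operator++(int) {
      ProxyIterator Old = *this;
      ++Idx;
      return Old; // returns a temporary copy of the iterator
    }
  };

  int main() {
    std::vector<int> V{1, 2, 3};
    ProxyIterator It{&V, 0, 0};
    const int &R = *It++;   // binds to a member of the temporary returned by It++
    std::cout << R << '\n'; // undefined behavior: R dangles, just like 'Assoc' above
    return 0;
  }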
diff --git a/clang/include/clang/AST/GlobalDecl.h b/clang/include/clang/AST/GlobalDecl.h index 88abba28c991d3..386693cabb1fbb 100644 --- a/clang/include/clang/AST/GlobalDecl.h +++ b/clang/include/clang/AST/GlobalDecl.h @@ -145,6 +145,10 @@ class GlobalDecl { LHS.MultiVersionIndex == RHS.MultiVersionIndex; } + bool operator!=(const GlobalDecl &Other) const { + return !(*this == Other); + } + void *getAsOpaquePtr() const { return Value.getOpaqueValue(); } explicit operator bool() const { return getAsOpaquePtr(); } diff --git a/clang/include/clang/AST/Mangle.h b/clang/include/clang/AST/Mangle.h index e586b0cec43dfd..d5f6c0f6cc67df 100644 --- a/clang/include/clang/AST/Mangle.h +++ b/clang/include/clang/AST/Mangle.h @@ -130,15 +130,15 @@ class MangleContext { // FIXME: consider replacing raw_ostream & with something like SmallString &. void mangleName(GlobalDecl GD, raw_ostream &); virtual void mangleCXXName(GlobalDecl GD, raw_ostream &) = 0; - virtual void mangleThunk(const CXXMethodDecl *MD, - const ThunkInfo &Thunk, - raw_ostream &) = 0; + virtual void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk, + bool ElideOverrideInfo, raw_ostream &) = 0; virtual void mangleCXXDtorThunk(const CXXDestructorDecl *DD, CXXDtorType Type, - const ThisAdjustment &ThisAdjustment, - raw_ostream &) = 0; + const ThunkInfo &Thunk, + bool ElideOverrideInfo, raw_ostream &) = 0; virtual void mangleReferenceTemporary(const VarDecl *D, unsigned ManglingNumber, raw_ostream &) = 0; + virtual void mangleCXXVTable(const CXXRecordDecl *RD, raw_ostream &) = 0; virtual void mangleCXXRTTI(QualType T, raw_ostream &) = 0; virtual void mangleCXXRTTIName(QualType T, raw_ostream &, bool NormalizeIntegers = false) = 0; @@ -192,7 +192,6 @@ class ItaniumMangleContext : public MangleContext { bool IsAux = false) : MangleContext(C, D, MK_Itanium, IsAux) {} - virtual void mangleCXXVTable(const CXXRecordDecl *RD, raw_ostream &) = 0; virtual void mangleCXXVTT(const CXXRecordDecl *RD, raw_ostream &) = 0; virtual void mangleCXXCtorVTable(const CXXRecordDecl *RD, int64_t Offset, const CXXRecordDecl *Type, diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 6df1d93a7ba2eb..5f7d6195187623 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -117,6 +117,8 @@ def ExtParameterInfo : PropertyType<"FunctionProtoType::ExtParameterInfo">; def FixedPointSemantics : PropertyType<"llvm::FixedPointSemantics"> { let PassByReference = 1; } +def FunctionEffect : PropertyType<"FunctionEffect">; +def EffectConditionExpr : PropertyType<"EffectConditionExpr">; def Identifier : RefPropertyType<"IdentifierInfo"> { let ConstWhenWriting = 1; } def LValuePathEntry : PropertyType<"APValue::LValuePathEntry">; def LValuePathSerializationHelper : diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 61246479188e95..62836ec5c63125 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -118,6 +118,7 @@ class EnumDecl; class Expr; class ExtQualsTypeCommonBase; class FunctionDecl; +class FunctionEffectSet; class IdentifierInfo; class NamedDecl; class ObjCInterfaceDecl; @@ -131,6 +132,7 @@ class TemplateArgument; class TemplateArgumentListInfo; class TemplateArgumentLoc; class TemplateTypeParmDecl; +template class TreeTransform; class TypedefNameDecl; class UnresolvedUsingTypenameDecl; class UsingShadowDecl; @@ -4524,8 +4526,13 @@ class FunctionType : public Type { LLVM_PREFERRED_TYPE(bool) unsigned 
HasArmTypeAttributes : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned EffectsHaveConditions : 1; + unsigned NumFunctionEffects : 4; + FunctionTypeExtraBitfields() - : NumExceptionType(0), HasArmTypeAttributes(false) {} + : NumExceptionType(0), HasArmTypeAttributes(false), + EffectsHaveConditions(false), NumFunctionEffects(0) {} }; /// The AArch64 SME ACLE (Arm C/C++ Language Extensions) define a number @@ -4658,6 +4665,296 @@ class FunctionNoProtoType : public FunctionType, public llvm::FoldingSetNode { } }; +// ------------------------------------------------------------------------------ + +/// Represents an abstract function effect, using just an enumeration describing +/// its kind. +class FunctionEffect { +public: + /// Identifies the particular effect. + enum class Kind : uint8_t { + None = 0, + NonBlocking = 1, + NonAllocating = 2, + Blocking = 3, + Allocating = 4 + }; + + /// Flags describing some behaviors of the effect. + using Flags = unsigned; + enum FlagBit : Flags { + // Can verification inspect callees' implementations? (e.g. nonblocking: + // yes, tcb+types: no). This also implies the need for 2nd-pass + // verification. + FE_InferrableOnCallees = 0x1, + + // Language constructs which effects can diagnose as disallowed. + FE_ExcludeThrow = 0x2, + FE_ExcludeCatch = 0x4, + FE_ExcludeObjCMessageSend = 0x8, + FE_ExcludeStaticLocalVars = 0x10, + FE_ExcludeThreadLocalVars = 0x20 + }; + +private: + LLVM_PREFERRED_TYPE(Kind) + unsigned FKind : 3; + + // Expansion: for hypothetical TCB+types, there could be one Kind for TCB, + // then ~16(?) bits "SubKind" to map to a specific named TCB. SubKind would + // be considered for uniqueness. + +public: + FunctionEffect() : FKind(unsigned(Kind::None)) {} + + explicit FunctionEffect(Kind K) : FKind(unsigned(K)) {} + + /// The kind of the effect. + Kind kind() const { return Kind(FKind); } + + /// Return the opposite kind, for effects which have opposites. + Kind oppositeKind() const; + + /// For serialization. + uint32_t toOpaqueInt32() const { return FKind; } + static FunctionEffect fromOpaqueInt32(uint32_t Value) { + return FunctionEffect(Kind(Value)); + } + + /// Flags describing some behaviors of the effect. + Flags flags() const { + switch (kind()) { + case Kind::NonBlocking: + return FE_InferrableOnCallees | FE_ExcludeThrow | FE_ExcludeCatch | + FE_ExcludeObjCMessageSend | FE_ExcludeStaticLocalVars | + FE_ExcludeThreadLocalVars; + case Kind::NonAllocating: + // Same as NonBlocking, except without FE_ExcludeStaticLocalVars. + return FE_InferrableOnCallees | FE_ExcludeThrow | FE_ExcludeCatch | + FE_ExcludeObjCMessageSend | FE_ExcludeThreadLocalVars; + case Kind::Blocking: + case Kind::Allocating: + return 0; + case Kind::None: + break; + } + llvm_unreachable("unknown effect kind"); + } + + /// The description printed in diagnostics, e.g. 'nonblocking'. + StringRef name() const; + + /// Return true if the effect is allowed to be inferred on the callee, + /// which is either a FunctionDecl or BlockDecl. + /// Example: This allows nonblocking(false) to prevent inference for the + /// function. + bool canInferOnFunction(const Decl &Callee) const; + + // Return false for success. When true is returned for a direct call, then the + // FE_InferrableOnCallees flag may trigger inference rather than an immediate + // diagnostic. Caller should be assumed to have the effect (it may not have it + // explicitly when inferring). 
+ bool shouldDiagnoseFunctionCall(bool Direct, + ArrayRef CalleeFX) const; + + friend bool operator==(const FunctionEffect &LHS, const FunctionEffect &RHS) { + return LHS.FKind == RHS.FKind; + } + friend bool operator!=(const FunctionEffect &LHS, const FunctionEffect &RHS) { + return !(LHS == RHS); + } + friend bool operator<(const FunctionEffect &LHS, const FunctionEffect &RHS) { + return LHS.FKind < RHS.FKind; + } +}; + +/// Wrap a function effect's condition expression in another struct so +/// that FunctionProtoType's TrailingObjects can treat it separately. +class EffectConditionExpr { + Expr *Cond = nullptr; // if null, unconditional. + +public: + EffectConditionExpr() = default; + EffectConditionExpr(Expr *E) : Cond(E) {} + + Expr *getCondition() const { return Cond; } + + bool operator==(const EffectConditionExpr &RHS) const { + return Cond == RHS.Cond; + } +}; + +/// A FunctionEffect plus a potential boolean expression determining whether +/// the effect is declared (e.g. nonblocking(expr)). Generally the condition +/// expression when present, is dependent. +struct FunctionEffectWithCondition { + FunctionEffect Effect; + EffectConditionExpr Cond; + + FunctionEffectWithCondition() = default; + FunctionEffectWithCondition(const FunctionEffect &E, + const EffectConditionExpr &C) + : Effect(E), Cond(C) {} + + /// Return a textual description of the effect, and its condition, if any. + std::string description() const; +}; + +/// Support iteration in parallel through a pair of FunctionEffect and +/// EffectConditionExpr containers. +template class FunctionEffectIterator { + friend Container; + + const Container *Outer = nullptr; + size_t Idx = 0; + +public: + FunctionEffectIterator(); + FunctionEffectIterator(const Container &O, size_t I) : Outer(&O), Idx(I) {} + bool operator==(const FunctionEffectIterator &Other) const { + return Idx == Other.Idx; + } + bool operator!=(const FunctionEffectIterator &Other) const { + return Idx != Other.Idx; + } + + FunctionEffectIterator operator++() { + ++Idx; + return *this; + } + + FunctionEffectWithCondition operator*() const { + assert(Outer != nullptr && "invalid FunctionEffectIterator"); + bool HasConds = !Outer->Conditions.empty(); + return FunctionEffectWithCondition{Outer->Effects[Idx], + HasConds ? Outer->Conditions[Idx] + : EffectConditionExpr()}; + } +}; + +/// An immutable set of FunctionEffects and possibly conditions attached to +/// them. The effects and conditions reside in memory not managed by this object +/// (typically, trailing objects in FunctionProtoType, or borrowed references +/// from a FunctionEffectSet). +/// +/// Invariants: +/// - there is never more than one instance of any given effect. +/// - the array of conditions is either empty or has the same size as the +/// array of effects. +/// - some conditions may be null expressions; each condition pertains to +/// the effect at the same array index. +/// +/// Also, if there are any conditions, at least one of those expressions will be +/// dependent, but this is only asserted in the constructor of +/// FunctionProtoType. +/// +/// See also FunctionEffectSet, in Sema, which provides a mutable set. +class FunctionEffectsRef { + // Restrict classes which can call the private constructor -- these friends + // all maintain the required invariants. FunctionEffectSet is generally the + // only way in which the arrays are created; FunctionProtoType will not + // reorder them. 
+ friend FunctionProtoType; + friend FunctionEffectSet; + + ArrayRef Effects; + ArrayRef Conditions; + + // The arrays are expected to have been sorted by the caller, with the + // effects in order. The conditions array must be empty or the same size + // as the effects array, since the conditions are associated with the effects + // at the same array indices. + FunctionEffectsRef(ArrayRef FX, + ArrayRef Conds) + : Effects(FX), Conditions(Conds) {} + +public: + /// Extract the effects from a Type if it is a function, block, or member + /// function pointer, or a reference or pointer to one. + static FunctionEffectsRef get(QualType QT); + + /// Asserts invariants. + static FunctionEffectsRef create(ArrayRef FX, + ArrayRef Conds); + + FunctionEffectsRef() = default; + + bool empty() const { return Effects.empty(); } + size_t size() const { return Effects.size(); } + + ArrayRef effects() const { return Effects; } + ArrayRef conditions() const { return Conditions; } + + using iterator = FunctionEffectIterator; + friend iterator; + iterator begin() const { return iterator(*this, 0); } + iterator end() const { return iterator(*this, size()); } + + friend bool operator==(const FunctionEffectsRef &LHS, + const FunctionEffectsRef &RHS) { + return LHS.Effects == RHS.Effects && LHS.Conditions == RHS.Conditions; + } + friend bool operator!=(const FunctionEffectsRef &LHS, + const FunctionEffectsRef &RHS) { + return !(LHS == RHS); + } + + void Profile(llvm::FoldingSetNodeID &ID) const; + void dump(llvm::raw_ostream &OS) const; +}; + +/// A mutable set of FunctionEffects and possibly conditions attached to them. +/// Used to compare and merge effects on declarations. +/// +/// Has the same invariants as FunctionEffectsRef. +class FunctionEffectSet { + SmallVector Effects; + SmallVector Conditions; + +public: + FunctionEffectSet() = default; + + explicit FunctionEffectSet(const FunctionEffectsRef &FX) + : Effects(FX.effects()), Conditions(FX.conditions()) {} + + bool empty() const { return Effects.empty(); } + size_t size() const { return Effects.size(); } + + using iterator = FunctionEffectIterator; + friend iterator; + iterator begin() const { return iterator(*this, 0); } + iterator end() const { return iterator(*this, size()); } + + operator FunctionEffectsRef() const { return {Effects, Conditions}; } + + void dump(llvm::raw_ostream &OS) const; + + // Mutators + + // On insertion, a conflict occurs when attempting to insert an + // effect which is opposite an effect already in the set, or attempting + // to insert an effect which is already in the set but with a condition + // which is not identical. + struct Conflict { + FunctionEffectWithCondition Kept; + FunctionEffectWithCondition Rejected; + }; + using Conflicts = SmallVector; + + // Returns true for success (obviating a check of Errs.empty()). + bool insert(const FunctionEffectWithCondition &NewEC, Conflicts &Errs); + + // Returns true for success (obviating a check of Errs.empty()). + bool insert(const FunctionEffectsRef &Set, Conflicts &Errs); + + // Set operations + + static FunctionEffectSet getUnion(FunctionEffectsRef LHS, + FunctionEffectsRef RHS, Conflicts &Errs); + static FunctionEffectSet getIntersection(FunctionEffectsRef LHS, + FunctionEffectsRef RHS); +}; + /// Represents a prototype with parameter type info, e.g. /// 'int foo(int)' or 'int foo(void)'. 'void' is represented as having no /// parameters, not as having a single void parameter. 
Such a type can have @@ -4672,7 +4969,8 @@ class FunctionProtoType final FunctionProtoType, QualType, SourceLocation, FunctionType::FunctionTypeExtraBitfields, FunctionType::FunctionTypeArmAttributes, FunctionType::ExceptionType, - Expr *, FunctionDecl *, FunctionType::ExtParameterInfo, Qualifiers> { + Expr *, FunctionDecl *, FunctionType::ExtParameterInfo, + FunctionEffect, EffectConditionExpr, Qualifiers> { friend class ASTContext; // ASTContext creates these. friend TrailingObjects; @@ -4703,9 +5001,15 @@ class FunctionProtoType final // an ExtParameterInfo for each of the parameters. Present if and // only if hasExtParameterInfos() is true. // + // * Optionally, an array of getNumFunctionEffects() FunctionEffect. + // Present only when getNumFunctionEffects() > 0 + // + // * Optionally, an array of getNumFunctionEffects() EffectConditionExpr. + // Present only when getNumFunctionEffectConditions() > 0. + // // * Optionally a Qualifiers object to represent extra qualifiers that can't - // be represented by FunctionTypeBitfields.FastTypeQuals. Present if and only - // if hasExtQualifiers() is true. + // be represented by FunctionTypeBitfields.FastTypeQuals. Present if and + // only if hasExtQualifiers() is true. // // The optional FunctionTypeExtraBitfields has to be before the data // related to the exception specification since it contains the number @@ -4761,6 +5065,7 @@ class FunctionProtoType final ExceptionSpecInfo ExceptionSpec; const ExtParameterInfo *ExtParameterInfos = nullptr; SourceLocation EllipsisLoc; + FunctionEffectsRef FunctionEffects; ExtProtoInfo() : Variadic(false), HasTrailingReturn(false), @@ -4778,7 +5083,8 @@ class FunctionProtoType final bool requiresFunctionProtoTypeExtraBitfields() const { return ExceptionSpec.Type == EST_Dynamic || - requiresFunctionProtoTypeArmAttributes(); + requiresFunctionProtoTypeArmAttributes() || + !FunctionEffects.empty(); } bool requiresFunctionProtoTypeArmAttributes() const { @@ -4826,6 +5132,14 @@ class FunctionProtoType final return hasExtParameterInfos() ? getNumParams() : 0; } + unsigned numTrailingObjects(OverloadToken) const { + return getNumFunctionEffects(); + } + + unsigned numTrailingObjects(OverloadToken) const { + return getNumFunctionEffectConditions(); + } + /// Determine whether there are any argument types that /// contain an unexpanded parameter pack. static bool containsAnyUnexpandedParameterPack(const QualType *ArgArray, @@ -4927,6 +5241,7 @@ class FunctionProtoType final EPI.RefQualifier = getRefQualifier(); EPI.ExtParameterInfos = getExtParameterInfosOrNull(); EPI.AArch64SMEAttributes = getAArch64SMEAttributes(); + EPI.FunctionEffects = getFunctionEffects(); return EPI; } @@ -5138,6 +5453,62 @@ class FunctionProtoType final return false; } + unsigned getNumFunctionEffects() const { + return hasExtraBitfields() + ? getTrailingObjects() + ->NumFunctionEffects + : 0; + } + + // For serialization. + ArrayRef getFunctionEffectsWithoutConditions() const { + if (hasExtraBitfields()) { + const auto *Bitfields = getTrailingObjects(); + if (Bitfields->NumFunctionEffects > 0) + return {getTrailingObjects(), + Bitfields->NumFunctionEffects}; + } + return {}; + } + + unsigned getNumFunctionEffectConditions() const { + if (hasExtraBitfields()) { + const auto *Bitfields = getTrailingObjects(); + if (Bitfields->EffectsHaveConditions) + return Bitfields->NumFunctionEffects; + } + return 0; + } + + // For serialization. 
+ ArrayRef getFunctionEffectConditions() const { + if (hasExtraBitfields()) { + const auto *Bitfields = getTrailingObjects(); + if (Bitfields->EffectsHaveConditions) + return {getTrailingObjects(), + Bitfields->NumFunctionEffects}; + } + return {}; + } + + // Combines effects with their conditions. + FunctionEffectsRef getFunctionEffects() const { + if (hasExtraBitfields()) { + const auto *Bitfields = getTrailingObjects(); + if (Bitfields->NumFunctionEffects > 0) { + const size_t NumConds = Bitfields->EffectsHaveConditions + ? Bitfields->NumFunctionEffects + : 0; + return FunctionEffectsRef( + {getTrailingObjects(), + Bitfields->NumFunctionEffects}, + {NumConds ? getTrailingObjects() : nullptr, + NumConds}); + } + } + return {}; + } + bool isSugared() const { return false; } QualType desugar() const { return QualType(this, 0); } diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index aba14b222a03ac..7d4353c2773a30 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -352,6 +352,12 @@ let Class = FunctionProtoType in { def : Property<"AArch64SMEAttributes", UInt32> { let Read = [{ node->getAArch64SMEAttributes() }]; } + def : Property<"functionEffects", Array> { + let Read = [{ node->getFunctionEffectsWithoutConditions() }]; + } + def : Property<"functionEffectConds", Array> { + let Read = [{ node->getFunctionEffectConditions() }]; + } def : Creator<[{ auto extInfo = FunctionType::ExtInfo(noReturn, hasRegParm, regParm, @@ -368,6 +374,7 @@ let Class = FunctionProtoType in { epi.ExtParameterInfos = extParameterInfo.empty() ? nullptr : extParameterInfo.data(); epi.AArch64SMEAttributes = AArch64SMEAttributes; + epi.FunctionEffects = FunctionEffectsRef::create(functionEffects, functionEffectConds); return ctx.getFunctionType(returnType, parameters, epi); }]>; } diff --git a/clang/include/clang/AST/VTableBuilder.h b/clang/include/clang/AST/VTableBuilder.h index fbf6c041a1ec11..a5de41dbc22f14 100644 --- a/clang/include/clang/AST/VTableBuilder.h +++ b/clang/include/clang/AST/VTableBuilder.h @@ -361,6 +361,10 @@ class VTableContextBase { }; class ItaniumVTableContext : public VTableContextBase { +public: + typedef llvm::DenseMap + OriginalMethodMapTy; + private: /// Contains the index (relative to the vtable address point) @@ -384,6 +388,10 @@ class ItaniumVTableContext : public VTableContextBase { VirtualBaseClassOffsetOffsetsMapTy; VirtualBaseClassOffsetOffsetsMapTy VirtualBaseClassOffsetOffsets; + /// Map from a virtual method to the nearest method in the primary base class + /// chain that it overrides. + OriginalMethodMapTy OriginalMethodMap; + void computeVTableRelatedInformation(const CXXRecordDecl *RD) override; public: @@ -425,6 +433,27 @@ class ItaniumVTableContext : public VTableContextBase { CharUnits getVirtualBaseOffsetOffset(const CXXRecordDecl *RD, const CXXRecordDecl *VBase); + /// Return the method that added the v-table slot that will be used to call + /// the given method. + /// + /// In the Itanium ABI, where overrides always cause methods to be added to + /// the primary v-table if they're not already there, this will be the first + /// declaration in the primary base class chain for which the return type + /// adjustment is trivial. 
+ GlobalDecl findOriginalMethod(GlobalDecl GD); + + const CXXMethodDecl *findOriginalMethodInMap(const CXXMethodDecl *MD) const; + + void setOriginalMethod(const CXXMethodDecl *Key, const CXXMethodDecl *Val) { + OriginalMethodMap[Key] = Val; + } + + /// This method is reserved for the implementation and shouldn't be used + /// directly. + const OriginalMethodMapTy &getOriginalMethodMap() { + return OriginalMethodMap; + } + static bool classof(const VTableContextBase *VT) { return !VT->isMicrosoft(); } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c8e2015c8e66ae..452cd1810f6539 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -686,6 +686,10 @@ class Attr { bit PragmaAttributeSupport; // Set to true if this attribute accepts parameter pack expansion expressions. bit AcceptsExprPack = 0; + // To support multiple enum parameters to an attribute without breaking + // our existing general parsing we need to have a separate flag that + // opts an attribute into strict parsing of attribute parameters + bit StrictEnumParameters = 0; // Lists language options, one of which is required to be true for the // attribute to be applicable. If empty, no language options are required. list LangOpts = []; @@ -1461,6 +1465,28 @@ def CXX11NoReturn : InheritableAttr { let Documentation = [CXX11NoReturnDocs]; } +def NonBlocking : TypeAttr { + let Spellings = [Clang<"nonblocking">]; + let Args = [ExprArgument<"Cond", /*optional*/1>]; + let Documentation = [NonBlockingDocs]; +} + +def NonAllocating : TypeAttr { + let Spellings = [Clang<"nonallocating">]; + let Args = [ExprArgument<"Cond", /*optional*/1>]; + let Documentation = [NonAllocatingDocs]; +} + +def Blocking : TypeAttr { + let Spellings = [Clang<"blocking">]; + let Documentation = [BlockingDocs]; +} + +def Allocating : TypeAttr { + let Spellings = [Clang<"allocating">]; + let Documentation = [AllocatingDocs]; +} + // Similar to CUDA, OpenCL attributes do not receive a [[]] spelling because // the specification does not expose them with one currently. 
def OpenCLKernel : InheritableAttr { @@ -4554,6 +4580,31 @@ def NoRandomizeLayout : InheritableAttr { } def : MutualExclusions<[RandomizeLayout, NoRandomizeLayout]>; +def VTablePointerAuthentication : InheritableAttr { + let Spellings = [Clang<"ptrauth_vtable_pointer">]; + let Subjects = SubjectList<[CXXRecord]>; + let Documentation = [Undocumented]; + let StrictEnumParameters = 1; + let Args = [EnumArgument<"Key", "VPtrAuthKeyType", /*is_string=*/ true, + ["default_key", "no_authentication", "process_dependent", + "process_independent"], + ["DefaultKey", "NoKey", "ProcessDependent", + "ProcessIndependent"]>, + EnumArgument<"AddressDiscrimination", "AddressDiscriminationMode", + /*is_string=*/ true, + ["default_address_discrimination", "no_address_discrimination", + "address_discrimination"], + ["DefaultAddressDiscrimination", "NoAddressDiscrimination", + "AddressDiscrimination"]>, + EnumArgument<"ExtraDiscrimination", "ExtraDiscrimination", + /*is_string=*/ true, + ["default_extra_discrimination", "no_extra_discrimination", + "type_discrimination", "custom_discrimination"], + ["DefaultExtraDiscrimination", "NoExtraDiscrimination", + "TypeDiscrimination", "CustomDiscrimination"]>, + IntArgument<"CustomDiscriminationValue", 1>]; +} + def FunctionReturnThunks : InheritableAttr, TargetSpecificAttr { let Spellings = [GCC<"function_return">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 70d5dfa8aaf868..ab4bd003541fa4 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -5672,9 +5672,10 @@ may be changed in the future. be used to pass function arguments. Floating-point registers (XMMs/YMMs) still follow the C calling convention. - On AArch64, only LR and FP are preserved by the callee. - Registers X19-X28, X0-X7, and X9-X15 are used to pass function arguments. - X8, X16-X18, SIMD and floating-point registers follow the AAPCS calling - convention. + Registers X20-X28, X0-X7, and X9-X14 are used to pass function arguments. + X8, X16-X19, SIMD and floating-point registers follow the AAPCS calling + convention. X15 is not available for argument passing on Windows, but is + used to pass arguments on other platforms. }]; } @@ -6584,7 +6585,8 @@ like pointers to an object of type ``T``: private: int *valuePointer; public: - int *getInt() { return &valuePointer; } + IntPointer(const IntOwner&); + int *getInt() { return valuePointer; } }; The argument ``T`` is optional and is ignored. @@ -6600,12 +6602,8 @@ When the Owner's lifetime ends, it will consider the Pointer to be dangling. .. code-block:: c++ int f() { - IntPointer P; - if (true) { - IntOwner O(7); - P = IntPointer(O); // P "points into" O - } // P is dangling - return P.get(); // error: Using a dangling Pointer. + IntPointer P(IntOwner{}); // P "points into" a temporary IntOwner object + P.getInt(); // P is dangling } }]; @@ -8106,3 +8104,72 @@ Attribute used by `clspv`_ (OpenCL-C to Vulkan SPIR-V compiler) to identify func .. _`libclc`: https://libclc.llvm.org }]; } + +def DocCatNonBlockingNonAllocating : DocumentationCategory<"Performance Constraint Attributes"> { + let Content = [{ +The ``nonblocking``, ``blocking``, ``nonallocating`` and ``allocating`` attributes can be attached +to function types, including blocks, C++ lambdas, and member functions. The attributes declare +constraints about a function's behavior pertaining to blocking and heap memory allocation. 
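Before the rules below, a brief illustrative sketch (the declarations are hypothetical and not taken from this patch) of how these attributes attach to function types, using the ``[[clang::...]]`` spellings introduced above:

.. code-block:: c++

  // Hypothetical declarations; GNU-style __attribute__ spellings exist as well.
  void process_audio_block() [[clang::nonblocking]];    // must not block
  int lookup_cached(int key) [[clang::nonallocating]];  // must not allocate heap memory
  void write_log_entry() [[clang::blocking]];           // may block; suppresses inference of nonblocking
  void *get_scratch_buffer() [[clang::allocating]];     // may allocate; suppresses inference of nonallocating

  // Converting to a nonblocking function pointer requires the source to carry
  // the attribute as well (or be a null pointer), per the first rule below.
  void (*rt_callback)() [[clang::nonblocking]] = process_audio_block;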
+ +There are several rules for function types with these attributes, enforced with +compiler warnings: + +- When assigning or otherwise converting to a function pointer of ``nonblocking`` or + ``nonallocating`` type, the source must also be a function or function pointer of + that type, unless it is a null pointer, i.e. the attributes should not be "spoofed". Conversions + that remove the attributes are transparent and valid. + +- An override of a ``nonblocking`` or ``nonallocating`` virtual method must also be declared + with that same attribute (or a stronger one.) An overriding method may add an attribute. + +- A redeclaration of a ``nonblocking`` or ``nonallocating`` function must also be declared with + the same attribute (or a stronger one). A redeclaration may add an attribute. + +The warnings are controlled by ``-Wfunction-effects``, which is enabled by default. + +In a future commit, the compiler will diagnose function calls from ``nonblocking`` and ``nonallocating`` +functions to other functions which lack the appropriate attribute. + }]; +} + +def NonBlockingDocs : Documentation { + let Category = DocCatNonBlockingNonAllocating; + let Heading = "nonblocking"; + let Content = [{ +Declares that a function or function type either does or does not block in any way, according +to the optional, compile-time constant boolean argument, which defaults to true. When the argument +is false, the attribute is equivalent to ``blocking``. + +For the purposes of diagnostics, ``nonblocking`` is considered to include the +``nonallocating`` guarantee and is therefore a "stronger" constraint or attribute. + }]; +} + +def NonAllocatingDocs : Documentation { + let Category = DocCatNonBlockingNonAllocating; + let Heading = "nonallocating"; + let Content = [{ +Declares that a function or function type either does or does not allocate heap memory, according +to the optional, compile-time constant boolean argument, which defaults to true. When the argument +is false, the attribute is equivalent to ``allocating``. + }]; +} + +def BlockingDocs : Documentation { + let Category = DocCatNonBlockingNonAllocating; + let Heading = "blocking"; + let Content = [{ +Declares that a function potentially blocks, and prevents any potential inference of ``nonblocking`` +by the compiler. + }]; +} + +def AllocatingDocs : Documentation { + let Category = DocCatNonBlockingNonAllocating; + let Heading = "allocating"; + let Content = [{ +Declares that a function potentially allocates heap memory, and prevents any potential inference +of ``nonallocating`` by the compiler. 
+ }]; +} + diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index c8f6104a7f1a7a..f5b15cf90d1f83 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1152,6 +1152,12 @@ def Trap : Builtin { let Prototype = "void()"; } +def VerboseTrap : Builtin { + let Spellings = ["__builtin_verbose_trap"]; + let Attributes = [NoThrow, NoReturn]; + let Prototype = "void(char const*, char const*)"; +} + def Debugtrap : Builtin { let Spellings = ["__builtin_debugtrap"]; let Attributes = [NoThrow]; diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a73e63355cfd73..56bba448e12a42 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -149,6 +149,12 @@ BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc") BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc") BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc") +BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vcQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vsQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "viQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_store_b64, "vV2iQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_store_b96, "vV3iQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_store_b128, "vV4iQbiiIi", "n") //===----------------------------------------------------------------------===// // Ballot builtins. diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 0d5e38e825aa78..83699f8897f663 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class OffloadArch { UNUSED, UNKNOWN, // TODO: Deprecate and remove GPU architectures older than sm_52. @@ -128,12 +128,13 @@ enum class CudaArch { GFX12_GENERIC, GFX1200, GFX1201, + AMDGCNSPIRV, Generic, // A processor model named 'generic' if the target backend defines a // public one. LAST, - CudaDefault = CudaArch::SM_52, - HIPDefault = CudaArch::GFX906, + CudaDefault = OffloadArch::SM_52, + HIPDefault = OffloadArch::GFX906, }; enum class CUDAFunctionTarget { @@ -144,26 +145,26 @@ enum class CUDAFunctionTarget { InvalidTarget }; -static inline bool IsNVIDIAGpuArch(CudaArch A) { - return A >= CudaArch::SM_20 && A < CudaArch::GFX600; +static inline bool IsNVIDIAOffloadArch(OffloadArch A) { + return A >= OffloadArch::SM_20 && A < OffloadArch::GFX600; } -static inline bool IsAMDGpuArch(CudaArch A) { +static inline bool IsAMDOffloadArch(OffloadArch A) { // Generic processor model is for testing only. - return A >= CudaArch::GFX600 && A < CudaArch::Generic; + return A >= OffloadArch::GFX600 && A < OffloadArch::Generic; } -const char *CudaArchToString(CudaArch A); -const char *CudaArchToVirtualArchString(CudaArch A); +const char *OffloadArchToString(OffloadArch A); +const char *OffloadArchToVirtualArchString(OffloadArch A); // The input should have the form "sm_20". -CudaArch StringToCudaArch(llvm::StringRef S); +OffloadArch StringToOffloadArch(llvm::StringRef S); -/// Get the earliest CudaVersion that supports the given CudaArch. -CudaVersion MinVersionForCudaArch(CudaArch A); +/// Get the earliest CudaVersion that supports the given OffloadArch. 
+CudaVersion MinVersionForOffloadArch(OffloadArch A); -/// Get the latest CudaVersion that supports the given CudaArch. -CudaVersion MaxVersionForCudaArch(CudaArch A); +/// Get the latest CudaVersion that supports the given OffloadArch. +CudaVersion MaxVersionForOffloadArch(OffloadArch A); // Various SDK-dependent features that affect CUDA compilation enum class CudaFeature { diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 9b37d4bd3205bf..de99cb46d3145a 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1527,6 +1527,10 @@ def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">; def UnsafeBufferUsageInContainer : DiagGroup<"unsafe-buffer-usage-in-container">; def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer]>; +// Warnings and notes related to the function effects system underlying +// the nonblocking and nonallocating attributes. +def FunctionEffects : DiagGroup<"function-effects">; + // Warnings and notes InstallAPI verification. def InstallAPIViolation : DiagGroup<"installapi-violation">; @@ -1535,3 +1539,7 @@ def BitIntExtension : DiagGroup<"bit-int-extension">; // Warnings about misuse of ExtractAPI options. def ExtractAPIMisuse : DiagGroup<"extractapi-misuse">; + +// Warnings about using the non-standard extension having an explicit specialization +// with a storage class specifier. +def ExplicitSpecializationStorageClass : DiagGroup<"explicit-specialization-storage-class">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index d8c3fee7841f43..1160b0f7a7a5a9 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1508,6 +1508,8 @@ def err_omp_unexpected_append_op : Error< "unexpected operation specified in 'append_args' clause, expected 'interop'">; def err_omp_unexpected_execution_modifier : Error< "unexpected 'execution' modifier in non-executable context">; +def err_omp_unknown_adjust_args_op : Error< + "incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'">; def err_omp_declare_variant_wrong_clause : Error< "expected %select{'match'|'match', 'adjust_args', or 'append_args'}0 clause " "on 'omp declare variant' directive">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 25a87078a57093..5dc36c594bcb7c 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -940,6 +940,13 @@ def warn_ptrauth_auth_null_pointer : def err_ptrauth_string_not_literal : Error< "argument must be a string literal%select{| of char type}0">; +def note_ptrauth_virtual_function_pointer_incomplete_arg_ret : + Note<"cannot take an address of a virtual member function if its return or " + "argument types are incomplete">; +def note_ptrauth_virtual_function_incomplete_arg_ret_type : + Note<"%0 is incomplete">; + + /// main() // static main() is not an error in C, just in C++. 
def warn_static_main : Warning<"'main' should not be declared static">, @@ -3246,6 +3253,9 @@ def warn_unsupported_target_attribute def err_attribute_unsupported : Error<"%0 attribute is not supported on targets missing %1;" " specify an appropriate -march= or -mcpu=">; +def err_attribute_unsupported_m_profile + : Error<"on M-profile architectures %0 attribute is not supported on targets missing %1;" + " specify an appropriate -march= or -mcpu=">; def err_duplicate_target_attribute : Error<"%select{unsupported|duplicate|unknown}0%select{| CPU|" " tune CPU}1 '%2' in the '%select{target|target_clones|target_version}3' " @@ -3272,11 +3282,23 @@ def warn_aligned_attr_underaligned : Warning, def err_attribute_sizeless_type : Error< "%0 attribute cannot be applied to sizeless type %1">; def err_attribute_argument_n_type : Error< - "%0 attribute requires parameter %1 to be %select{int or bool|an integer " - "constant|a string|an identifier|a constant expression|a builtin function}2">; + "%0 attribute requires parameter %1 to be %select{" + "int or bool" + "|an integer constant" + "|a string" + "|an identifier" + "|a constant expression" + "|a builtin function" + "|nullptr or a bool, int, or enum literal}2">; def err_attribute_argument_type : Error< - "%0 attribute requires %select{int or bool|an integer " - "constant|a string|an identifier}1">; + "%0 attribute requires %select{" + "int or bool" + "|an integer constant" + "|a string" + "|an identifier" + "|a constant expression" + "|a builtin function" + "|a pointer or a bool, int, or enum literal}1">; def err_attribute_argument_out_of_range : Error< "%0 attribute requires integer constant between %1 and %2 inclusive">; def err_init_priority_object_attr : Error< @@ -3728,7 +3750,8 @@ def warn_attribute_wrong_decl_type : Warning< "|types and namespaces" "|variables, functions and classes" "|kernel functions" - "|non-K&R-style functions}2">, + "|non-K&R-style functions" + "|functions that return bool, integer, or a pointer type}2">, InGroup; def err_attribute_wrong_decl_type : Error; def warn_type_attribute_wrong_type : Warning< @@ -5366,7 +5389,7 @@ def err_not_class_template_specialization : Error< "cannot specialize a %select{dependent template|template template " "parameter}0">; def ext_explicit_specialization_storage_class : ExtWarn< - "explicit specialization cannot have a storage class">; + "explicit specialization cannot have a storage class">, InGroup; def err_dependent_function_template_spec_no_match : Error< "no candidate function template was found for dependent" " %select{member|friend}0 function template specialization">; @@ -6075,6 +6098,9 @@ def err_thread_thread_different_kind : Error< def err_mismatched_owning_module : Error< "declaration of %0 in %select{the global module|module %2}1 follows " "declaration in %select{the global module|module %4}3">; +def err_multiple_decl_in_different_modules : Error< + "declaration %0 attached to named module '%1' can't be attached to " + "other modules">; def err_redefinition_different_type : Error< "redefinition of %0 with a different type%diff{: $ vs $|}1,2">; def err_redefinition_different_kind : Error< @@ -8937,6 +8963,8 @@ def err_expected_callable_argument : Error< "expected a callable expression as %ordinal0 argument to %1, found %2">; def note_building_builtin_dump_struct_call : Note< "in call to printing function with arguments '(%0)' while dumping struct">; +def err_builtin_verbose_trap_arg : Error< + "argument to __builtin_verbose_trap must %select{be a pointer to a constant 
string|not contain $}0">; def err_atomic_load_store_uses_lib : Error< "atomic %select{load|store}0 requires runtime support that is not " @@ -10881,6 +10909,23 @@ def warn_imp_cast_drops_unaligned : Warning< "implicit cast from type %0 to type %1 drops __unaligned qualifier">, InGroup>; +// Function effects +// spoofing nonblocking/nonallocating +def warn_invalid_add_func_effects : Warning< + "attribute '%0' should not be added via type conversion">, + InGroup; +def warn_mismatched_func_effect_override : Warning< + "attribute '%0' on overriding function does not match base declaration">, + InGroup; +def warn_mismatched_func_effect_redeclaration : Warning< + "attribute '%0' on function does not match previous declaration">, + InGroup; +def warn_conflicting_func_effects : Warning< + "effects conflict when merging declarations; kept '%0', discarded '%1'">, + InGroup; +def err_func_with_effects_no_prototype : Error< + "'%0' function must have a prototype">; + } // end of sema category let CategoryName = "API Notes Issue" in { @@ -11072,7 +11117,7 @@ def err_omp_loop_variable_type : Error< def err_omp_loop_incr_not_compatible : Error< "increment expression must cause %0 to %select{decrease|increase}1 " "on each iteration of OpenMP for loop">; -def note_omp_loop_cond_requres_compatible_incr : Note< +def note_omp_loop_cond_requires_compatible_incr : Note< "loop step is expected to be %select{negative|positive}0 due to this condition">; def err_omp_loop_diff_cxx : Error< "could not calculate number of iterations calling 'operator-' with " @@ -11135,16 +11180,12 @@ def err_omp_several_directives_in_region : Error< def note_omp_previous_directive : Note< "previous '%0' directive used here">; def err_omp_sections_not_compound_stmt : Error< - "the statement for '#pragma omp sections' must be a compound statement">; -def err_omp_parallel_sections_not_compound_stmt : Error< - "the statement for '#pragma omp parallel sections' must be a compound statement">; + "the statement for '#pragma omp %0' must be a compound statement">; def err_omp_orphaned_section_directive : Error< "%select{orphaned 'omp section' directives are prohibited, it|'omp section' directive}0" " must be closely nested to a sections region%select{|, not a %1 region}0">; def err_omp_sections_substmt_not_section : Error< - "statement in 'omp sections' directive must be enclosed into a section region">; -def err_omp_parallel_sections_substmt_not_section : Error< - "statement in 'omp parallel sections' directive must be enclosed into a section region">; + "statement in 'omp %0' directive must be enclosed into a section region">; def err_omp_parallel_reduction_in_task_firstprivate : Error< "argument of a reduction clause of a %0 construct must not appear in a firstprivate clause on a task construct">; def err_omp_atomic_read_not_expression_statement : Error< @@ -12182,6 +12223,30 @@ def warn_cuda_maxclusterrank_sm_90 : Warning< "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " "%1 attribute">, InGroup; +// VTable pointer authentication errors +def err_non_polymorphic_vtable_pointer_auth : Error< + "cannot set vtable pointer authentication on monomorphic type %0">; +def err_incomplete_type_vtable_pointer_auth : Error< + "cannot set vtable pointer authentication on an incomplete type %0">; +def err_non_top_level_vtable_pointer_auth : Error< + "cannot set vtable pointer authentication on %0 which is a subclass of polymorphic type %1">; +def err_duplicated_vtable_pointer_auth : Error< + "multiple vtable pointer 
authentication policies on %0">; +def err_invalid_authentication_key : Error< + "invalid authentication key %0">; +def err_invalid_address_discrimination : Error< + "invalid address discrimination mode %0">; +def err_invalid_extra_discrimination : Error< + "invalid extra discrimination selection %0">; +def err_invalid_custom_discrimination : Error< + "invalid custom discrimination">; +def err_missing_custom_discrimination : Error< + "missing custom discrimination">; +def err_no_default_vtable_pointer_auth : Error< + "cannot specify a default vtable pointer authentication " + "%select{key|address discrimination mode|discriminator}0 with no default set" +>; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 75e88afbd97050..91f1c2f2e6239e 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -575,6 +575,10 @@ class LangOptions : public LangOptionsBase { // implementation on real-world examples. std::string OpenACCMacroOverride; + // Indicates if the wasm-opt binary must be ignored in the case of a + // WebAssembly target. + bool NoWasmOpt = false; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index d127498774c7fa..29a852d6665111 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -368,6 +368,14 @@ bool needsTaskBasedThreadLimit(OpenMPDirectiveKind DKind); /// is restricted only to memory order clauses of "OMPC_acquire", /// "OMPC_relaxed" and "OMPC_seq_cst". bool checkFailClauseParameter(OpenMPClauseKind FailClauseParameter); + +/// Checks if the specified directive is considered as "executable". This +/// combines the OpenMP categories of "executable" and "subsidiary", plus +/// any other directives that should be treated as executable. +/// \param DKind Specified directive. +/// \return true - if the above condition is met for this directive +/// otherwise - false. +bool isOpenMPExecutableDirective(OpenMPDirectiveKind DKind); } #endif diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h index aaad4a2b2b5ae6..197d63642ca6d2 100644 --- a/clang/include/clang/Basic/PointerAuthOptions.h +++ b/clang/include/clang/Basic/PointerAuthOptions.h @@ -47,6 +47,12 @@ class PointerAuthSchema { /// No additional discrimination. None, + /// Include a hash of the entity's type. + Type, + + /// Include a hash of the entity's identity. + Decl, + /// Discriminate using a constant value. Constant, }; @@ -150,6 +156,25 @@ class PointerAuthSchema { struct PointerAuthOptions { /// The ABI for C function pointers. PointerAuthSchema FunctionPointers; + + /// The ABI for C++ virtual table pointers (the pointer to the table + /// itself) as installed in an actual class instance. + PointerAuthSchema CXXVTablePointers; + + /// TypeInfo has external ABI requirements and is emitted without + /// actually having parsed the libcxx definition, so we can't simply + /// perform a look up. The settings for this should match the exact + /// specification in type_info.h + PointerAuthSchema CXXTypeInfoVTablePointer; + + /// The ABI for C++ virtual table pointers as installed in a VTT. 
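  // A minimal usage sketch of what these schemas control. The attribute
  // spelling and enumerator names below are assumptions for illustration,
  // not definitions introduced by this patch:
  //
  //   class [[clang::ptrauth_vtable_pointer(process_independent,
  //                                         address_discrimination,
  //                                         type_discrimination)]] Base {
  //     virtual void f(); // the vtable pointer stored in Base objects is
  //                       // signed and authenticated with the selected schema
  //   };
  //
  // Classes without such an attribute use the default CXXVTablePointers
  // schema configured in this struct.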
+ PointerAuthSchema CXXVTTVTablePointers; + + /// The ABI for most C++ virtual function pointers, i.e. v-table entries. + PointerAuthSchema CXXVirtualFunctionPointers; + + /// The ABI for variadic C++ virtual function pointers. + PointerAuthSchema CXXVirtualVariadicFunctionPointers; }; } // end namespace clang diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index d2e2e914327f2b..d3ccc7ef81c079 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -1981,6 +1981,7 @@ class SourceManager : public RefCountedBase { SourceLocation SpellLoc, SourceLocation ExpansionLoc, unsigned ExpansionLength) const; + void updateSlocUsageStats() const; }; /// Comparison function object. diff --git a/clang/include/clang/Basic/TargetCXXABI.h b/clang/include/clang/Basic/TargetCXXABI.h index c113a6a048ad44..d204452afbf4b3 100644 --- a/clang/include/clang/Basic/TargetCXXABI.h +++ b/clang/include/clang/Basic/TargetCXXABI.h @@ -116,7 +116,7 @@ class TargetCXXABI { return T.isKnownWindowsMSVCEnvironment(); } llvm_unreachable("invalid CXXABI kind"); - }; + } /// Does this ABI generally fall into the Itanium family of ABIs? bool isItaniumFamily() const { diff --git a/clang/include/clang/Basic/Thunk.h b/clang/include/clang/Basic/Thunk.h index 0247e279408f08..8ff7603e0094d7 100644 --- a/clang/include/clang/Basic/Thunk.h +++ b/clang/include/clang/Basic/Thunk.h @@ -21,6 +21,7 @@ namespace clang { class CXXMethodDecl; +class Type; /// A return adjustment. struct ReturnAdjustment { @@ -162,20 +163,24 @@ struct ThunkInfo { /// Holds a pointer to the overridden method this thunk is for, /// if needed by the ABI to distinguish different thunks with equal - /// adjustments. Otherwise, null. + /// adjustments. + /// In the Itanium ABI, this field can hold the method that created the + /// vtable entry for this thunk. + /// Otherwise, null. /// CAUTION: In the unlikely event you need to sort ThunkInfos, consider using /// an ABI-specific comparator. 
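  // A minimal sketch of the new equality behaviour (Adj, Ret, BaseATy,
  // BaseBTy and MD are illustrative names only): thunks with identical
  // adjustments and method but different this-pointer types now compare
  // unequal.
  //
  //   ThunkInfo A(Adj, Ret, /*ThisT=*/BaseATy, MD);
  //   ThunkInfo B(Adj, Ret, /*ThisT=*/BaseBTy, MD);
  //   assert(!(A == B)); // distinguished by ThisType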
const CXXMethodDecl *Method; + const Type *ThisType; - ThunkInfo() : Method(nullptr) {} + ThunkInfo() : Method(nullptr), ThisType(nullptr) {} ThunkInfo(const ThisAdjustment &This, const ReturnAdjustment &Return, - const CXXMethodDecl *Method = nullptr) - : This(This), Return(Return), Method(Method) {} + const Type *ThisT, const CXXMethodDecl *Method = nullptr) + : This(This), Return(Return), Method(Method), ThisType(ThisT) {} friend bool operator==(const ThunkInfo &LHS, const ThunkInfo &RHS) { return LHS.This == RHS.This && LHS.Return == RHS.Return && - LHS.Method == RHS.Method; + LHS.Method == RHS.Method && LHS.ThisType == RHS.ThisType; } bool isEmpty() const { diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 564a58e4eb6709..ce211f97d1c968 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -15,11 +15,13 @@ include "arm_sve_sme_incl.td" +let SVETargetGuard = InvalidMode in { + //////////////////////////////////////////////////////////////////////////////// // Loads multiclass ZALoad ch> { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimPQ", t, [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_horiz", ch>; @@ -44,7 +46,7 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0 defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; -let TargetGuard = "sme" in { +let SMETargetGuard = "sme" in { def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr">; @@ -58,7 +60,7 @@ def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", // Stores multiclass ZAStore ch> { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimP%", t, [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_horiz", ch>; @@ -83,7 +85,7 @@ defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; -let TargetGuard = "sme" in { +let SMETargetGuard = "sme" in { def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str">; @@ -97,7 +99,7 @@ def SVSTR_ZA : MInst<"svstr_za", "vm%", "", // Read horizontal/vertical ZA slices multiclass ZARead ch> { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPim", t, MergeOp1, i_prefix # "_horiz", [IsReadZA, IsStreaming, IsInZA], ch>; @@ -118,7 +120,7 @@ defm SVREAD_ZA128 : ZARead<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_readq", [Im // Write horizontal/vertical ZA slices multiclass ZAWrite ch> { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimPd", t, MergeOp1, i_prefix # "_horiz", [IsWriteZA, IsStreaming, IsInOutZA], ch>; @@ -138,7 +140,7 @@ defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_writeq", //////////////////////////////////////////////////////////////////////////////// // SME - Zero -let TargetGuard = "sme" in { +let SMETargetGuard = "sme" in { def 
SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], [ImmCheck<0, ImmCheck0_255>]>; @@ -146,7 +148,7 @@ let TargetGuard = "sme" in { [IsOverloadNone, IsStreamingCompatible, IsOutZA]>; } -let TargetGuard = "sme2p1" in { +let SMETargetGuard = "sme2p1" in { def SVZERO_ZA64_VG1x2 : SInst<"svzero_za64_vg1x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x2", [IsOverloadNone, IsStreaming, IsInOutZA]>; def SVZERO_ZA64_VG1x4 : SInst<"svzero_za64_vg1x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x4", @@ -169,7 +171,7 @@ let TargetGuard = "sme2p1" in { // SME - Counting elements in a streaming vector multiclass ZACount { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone, "aarch64_sme_" # n_suffix, [IsOverloadNone, IsStreamingCompatible]>; @@ -185,13 +187,13 @@ defm SVCNTSD : ZACount<"cntsd">; // SME - ADDHA/ADDVA multiclass ZAAdd { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _ZA32: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPd", "iUi", MergeOp1, "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } - let TargetGuard = "sme-i16i64" in { + let SMETargetGuard = "sme-i16i64" in { def NAME # _ZA64: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPd", "lUl", MergeOp1, "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; @@ -205,7 +207,7 @@ defm SVADDVA : ZAAdd<"addva">; // SME - SMOPA, SMOPS, UMOPA, UMOPS multiclass ZAIntOuterProd { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "c", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", @@ -213,7 +215,7 @@ multiclass ZAIntOuterProd { [ImmCheck<0, ImmCheck0_3>]>; } - let TargetGuard = "sme-i16i64" in { + let SMETargetGuard = "sme-i16i64" in { def NAME # _ZA64_H: SInst<"sv" # n_suffix2 # "_za64[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "s", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", @@ -231,7 +233,7 @@ defm SVUMOPS : ZAIntOuterProd<"u", "mops">; // SME - SUMOPA, SUMOPS, USMOPA, USMOPS multiclass ZAIntOuterProdMixedSigns { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _ZA32_B: SInst<"sv" # n_suffix1 # n_suffix2 # "_za32[_{d}]", "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "c", @@ -240,7 +242,7 @@ multiclass ZAIntOuterProdMixedSigns { [ImmCheck<0, ImmCheck0_3>]>; } - let TargetGuard = "sme-i16i64" in { + let SMETargetGuard = "sme-i16i64" in { def NAME # _ZA64_H: SInst<"sv" # n_suffix1 # n_suffix2 # "_za64[_{d}]", "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "s", @@ -259,7 +261,7 @@ defm SVUSMOPS : ZAIntOuterProdMixedSigns<"us", "mops">; // SME - FMOPA, FMOPS multiclass ZAFPOuterProd { - let TargetGuard = "sme" in { + let SMETargetGuard = "sme" in { def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "h", MergeOp1, "aarch64_sme_" # n_suffix # "_wide", [IsStreaming, IsInOutZA], @@ -276,7 +278,7 @@ multiclass ZAFPOuterProd { [ImmCheck<0, ImmCheck0_3>]>; } - let TargetGuard = "sme-f64f64" in { + let SMETargetGuard = "sme-f64f64" in { def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d", MergeOp1, "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], @@ -291,7 +293,7 @@ defm 
SVMOPS : ZAFPOuterProd<"mops">; // SME2 - ADD, SUB multiclass ZAAddSub { - let TargetGuard = "sme2" in { + let SMETargetGuard = "sme2" in { def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; @@ -302,7 +304,7 @@ multiclass ZAAddSub { def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsInOutZA], []>; } - let TargetGuard = "sme2,sme-i16i64" in { + let SMETargetGuard = "sme2,sme-i16i64" in { def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; @@ -313,17 +315,17 @@ multiclass ZAAddSub { def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } - let TargetGuard = "sme2,sme-f64f64" in { + let SMETargetGuard = "sme2,sme-f64f64" in { def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } - let TargetGuard = "sme-f16f16|sme-f8f16" in { + let SMETargetGuard = "sme-f16f16|sme-f8f16" in { def NAME # _ZA16_VG1X2_F16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x2", "vm2", "h", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA16_VG1X4_F16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x4", "vm4", "h", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x4", [IsStreaming, IsInOutZA], []>; } - let TargetGuard = "sme2,b16b16" in { + let SMETargetGuard = "sme2,b16b16" in { def NAME # _ZA16_VG1X2_BF16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x2", "vm2", "b", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA16_VG1X4_BF16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x4", "vm4", "b", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x4", [IsStreaming, IsInOutZA], []>; } @@ -347,7 +349,7 @@ multiclass ZAWrite_VG checks> { def NAME # _VG1x4 : Inst<"svwrite_" # n # "[_{d}]_vg1x4", "vm4", t, MergeNone, i # "_vg1x4", [IsInOutZA, IsStreaming], []>; } -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { defm SVWRITE_ZA8 : ZAWrite_VG<"za8", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>]>; defm SVWRITE_ZA16 : ZAWrite_VG<"za16", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>]>; defm SVWRITE_ZA32 : ZAWrite_VG<"za32", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>]>; @@ -363,7 +365,7 @@ multiclass ZARead_VG checks> { def NAME # _VG1x4 : Inst<"svread_" # n # "_{d}_vg1x4", "4m", t, MergeNone, i # "_vg1x4", [IsInZA, IsStreaming], []>; } -let TargetGuard = "sme2" in { +let 
SMETargetGuard = "sme2" in { defm SVREAD_ZA8 : ZARead_VG<"za8", "cUc", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_0>]>; defm SVREAD_ZA16 : ZARead_VG<"za16", "sUshb", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_1>]>; defm SVREAD_ZA32 : ZARead_VG<"za32", "iUif", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_3>]>; @@ -374,7 +376,7 @@ let TargetGuard = "sme2" in { // Outer product and accumulate/subtract // -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { def SVSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "s", MergeNone, "aarch64_sme_smopa_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; def SVUSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "Us", MergeNone, "aarch64_sme_umopa_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; @@ -434,7 +436,7 @@ let TargetGuard = "sme2" in { def SVDOT_LANE_ZA32_VG1x4_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; } -let TargetGuard = "sme2,sme-i16i64" in { +let SMETargetGuard = "sme2,sme-i16i64" in { def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; @@ -453,7 +455,7 @@ let TargetGuard = "sme2,sme-i16i64" in { } // FMLA/FMLS -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>; def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>; def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>; @@ -470,7 +472,7 @@ let TargetGuard = "sme2" in { def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; } -let TargetGuard = "sme2,sme-f64f64" in { +let SMETargetGuard = "sme2,sme-f64f64" in { def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>; def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>; def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>; @@ -487,7 +489,7 @@ let TargetGuard = "sme2,sme-f64f64" in { def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; } -let TargetGuard = "sme-f16f16" in { +let SMETargetGuard = "sme-f16f16" in { def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>; def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>; def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>; @@ -504,7 +506,7 @@ let TargetGuard = "sme-f16f16" in { def 
SVMLS_LANE_VG1x4_F16 : Inst<"svmls_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>; } -let TargetGuard = "sme2,b16b16" in { +let SMETargetGuard = "sme2,b16b16" in { def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>; def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>; def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>; @@ -523,7 +525,7 @@ let TargetGuard = "sme2,b16b16" in { // FMLAL/FMLSL/UMLAL/SMLAL // SMLALL/UMLALL/USMLALL/SUMLALL -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { // MULTI MLAL def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsInOutZA], []>; def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsInOutZA], []>; @@ -653,7 +655,7 @@ let TargetGuard = "sme2" in { def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vm4xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>; } -let TargetGuard = "sme2,sme-i16i64" in { +let SMETargetGuard = "sme2,sme-i16i64" in { // MULTI MLAL def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsInOutZA], []>; def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsInOutZA], []>; @@ -702,7 +704,7 @@ let TargetGuard = "sme2,sme-i16i64" in { // // Spill and fill of ZT0 // -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>]>; def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; } @@ -710,14 +712,14 @@ let TargetGuard = "sme2" in { // // Zero ZT0 // -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsOutZT0], [ImmCheck<0, ImmCheck0_0>]>; } // // lookup table expand four contiguous registers // -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; } @@ -725,7 +727,7 @@ let TargetGuard = "sme2" in { // // lookup table expand one register // -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt", 
[IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; } @@ -733,14 +735,14 @@ let TargetGuard = "sme2" in { // // lookup table expand two contiguous registers // -let TargetGuard = "sme2" in { +let SMETargetGuard = "sme2" in { def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; } //////////////////////////////////////////////////////////////////////////////// // SME2p1 - FMOPA, FMOPS (non-widening) -let TargetGuard = "sme2,b16b16" in { +let SMETargetGuard = "sme2,b16b16" in { def SVMOPA_BF16_NW : SInst<"svmopa_za16[_bf16]_m", "viPPdd", "b", MergeNone, "aarch64_sme_mopa", [IsStreaming, IsInOutZA], @@ -751,7 +753,7 @@ let TargetGuard = "sme2,b16b16" in { [ImmCheck<0, ImmCheck0_1>]>; } -let TargetGuard = "sme-f16f16" in { +let SMETargetGuard = "sme-f16f16" in { def SVMOPA_F16_NW : SInst<"svmopa_za16[_f16]_m", "viPPdd", "h", MergeNone, "aarch64_sme_mopa", [IsStreaming, IsInOutZA], @@ -761,3 +763,46 @@ let TargetGuard = "sme-f16f16" in { [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_1>]>; } + + +multiclass ZAReadz ch> { + let SMETargetGuard = "sme2p1" in { + def NAME # _H : SInst<"svreadz_hor_" # n_suffix # "_{d}_vg" # vg_num, vg_num # "im", t, + MergeNone, i_prefix # "_horiz_x" # vg_num, + [IsStreaming, IsInOutZA], ch>; + + def NAME # _V : SInst<"svreadz_ver_" # n_suffix # "_{d}_vg" # vg_num, vg_num # "im", t, + MergeNone, i_prefix # "_vert_x" #vg_num, + [IsStreaming, IsInOutZA], ch>; + } +} + +defm SVREADZ_ZA8_X2 : ZAReadz<"za8", "2", "cUc", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>; +defm SVREADZ_ZA16_X2 : ZAReadz<"za16", "2", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>; +defm SVREADZ_ZA32_X2 : ZAReadz<"za32", "2", "iUif", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>; +defm SVREADZ_ZA64_X2 : ZAReadz<"za64", "2", "lUld", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>; + +defm SVREADZ_ZA8_X4 : ZAReadz<"za8", "4", "cUc", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>; +defm SVREADZ_ZA16_X4 : ZAReadz<"za16", "4", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>; +defm SVREADZ_ZA32_X4 : ZAReadz<"za32", "4", "iUif", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>; +defm SVREADZ_ZA64_X4 : ZAReadz<"za64", "4", "lUld", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>; + + +multiclass ZAReadzSingle ch> { + let SMETargetGuard = "sme2p1" in { + def NAME # _H : SInst<"svreadz_hor_" # n_suffix # "_{d}", "dim", t, + MergeNone, i_prefix # "_horiz", + [IsStreaming, IsInOutZA], ch>; + + def NAME # _V : SInst<"svreadz_ver_" # n_suffix # "_{d}", "dim", t, + MergeNone, i_prefix # "_vert", + [IsStreaming, IsInOutZA], ch>; + } +} + +defm SVREADZ_ZA8 : ZAReadzSingle<"za8", "cUc", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>; +defm SVREADZ_ZA16 : ZAReadzSingle<"za16", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>; +defm SVREADZ_ZA32 : ZAReadzSingle<"za32", "iUif", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>; +defm SVREADZ_ZA64 : ZAReadzSingle<"za64", "lUld", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>; +defm SVREADZ_ZA128 : ZAReadzSingle<"za128", "csilUcUiUsUlbhfd", "aarch64_sme_readz_q", [ImmCheck<0, ImmCheck0_15>]>; +} // let SVETargetGuard = InvalidMode diff --git 
a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 4a3f92520ba74f..94c093d8911562 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -27,7 +27,7 @@ def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtRetu def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_ld1">; def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_ld1">; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; } @@ -41,7 +41,7 @@ def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_ld1">; def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_ld1">; -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { // Load one vector (vector base) def SVLD1_GATHER_BASES_U : MInst<"svld1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1_gather_scalar_offset">; def SVLD1SB_GATHER_BASES_U : MInst<"svld1sb_gather[_{2}base]_{d}", "dPu", "ilUiUl", [IsGatherLoad], MemEltTyInt8, "aarch64_sve_ld1_gather_scalar_offset">; @@ -137,14 +137,14 @@ def SVLDFF1SH_VNUM : MInst<"svldff1sh_vnum_{d}", "dPTl", "ilUiUl", [IsL def SVLDFF1UH_VNUM : MInst<"svldff1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ldff1">; def SVLDFF1SW_VNUM : MInst<"svldff1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ldff1">; def SVLDFF1UW_VNUM : MInst<"svldff1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldff1">; -} // let TargetGuard = "sve" +} // let SVETargetGuard = "sve", SMETargetGuard = InvalidMode -let TargetGuard = "sve,bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = InvalidMode in { def SVLDFF1_BF : MInst<"svldff1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; def SVLDFF1_VNUM_BF : MInst<"svldff1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; } -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { // First-faulting load one vector (vector base) def SVLDFF1_GATHER_BASES_U : MInst<"svldff1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ldff1_gather_scalar_offset">; def SVLDFF1SB_GATHER_BASES_U : MInst<"svldff1sb_gather[_{2}base]_{d}", "dPu", "ilUiUl", [IsGatherLoad], MemEltTyInt8, "aarch64_sve_ldff1_gather_scalar_offset">; @@ -239,9 +239,9 @@ def SVLDNF1SH_VNUM : MInst<"svldnf1sh_vnum_{d}", "dPTl", "ilUiUl", [IsL def SVLDNF1UH_VNUM : MInst<"svldnf1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ldnf1">; def SVLDNF1SW_VNUM : MInst<"svldnf1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ldnf1">; def SVLDNF1UW_VNUM : MInst<"svldnf1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldnf1">; -} // let TargetGuard = "sve" +} // let SVETargetGuard = "sve", SMETargetGuard = 
InvalidMode -let TargetGuard = "sve,bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = InvalidMode in { def SVLDNF1_BF : MInst<"svldnf1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; def SVLDNF1_VNUM_BF : MInst<"svldnf1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; } @@ -252,7 +252,7 @@ def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, VerifyRu // Load one vector, unextended load, non-temporal (scalar base, VL displacement) def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; } @@ -260,13 +260,13 @@ let TargetGuard = "(sve|sme),bf16" in { // Load one quadword and replicate (scalar base) def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq", [VerifyRuntimeMode]>; } multiclass StructLoad { def : SInst; - let TargetGuard = "(sve|sme),bf16" in { + let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def: SInst; } } @@ -282,18 +282,18 @@ defm SVLD3_VNUM : StructLoad<"svld3_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3_sret"> defm SVLD4_VNUM : StructLoad<"svld4_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4_sret">; // Load one octoword and replicate (scalar base) -let TargetGuard = "sve,f64mm" in { +let SVETargetGuard = "sve,f64mm", SMETargetGuard = InvalidMode in { def SVLD1RO : SInst<"svld1ro[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1ro">; } -let TargetGuard = "sve,f64mm,bf16" in { +let SVETargetGuard = "sve,f64mm,bf16", SMETargetGuard = InvalidMode in { def SVLD1RO_BF16 : SInst<"svld1ro[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1ro">; } -let TargetGuard = "sve,bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = InvalidMode in { def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone]>; } -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, VerifyRuntimeMode]>; def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, VerifyRuntimeMode]>; def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, VerifyRuntimeMode]>; @@ -303,9 +303,9 @@ let TargetGuard = "(sve|sme),bf16" in { def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_3>]>; def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; -} // let TargetGuard = 
"(sve|sme),bf16" +} // let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" -let TargetGuard = "sve2p1" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in { // Contiguous zero-extending load to quadword (single vector). def SVLD1UWQ : MInst<"svld1uwq[_{d}]", "dPc", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">; def SVLD1UWQ_VNUM : MInst<"svld1uwq_vnum[_{d}]", "dPcl", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">; @@ -358,12 +358,12 @@ def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsSt def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_st1">; def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_st1">; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; } -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { // Store one vector (vector base) def SVST1_SCATTER_BASES_U : MInst<"svst1_scatter[_{2}base_{d}]", "vPud", "ilUiUlfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1_scatter_scalar_offset">; def SVST1B_SCATTER_BASES_U : MInst<"svst1b_scatter[_{2}base_{d}]", "vPud", "ilUiUl", [IsScatterStore], MemEltTyInt8, "aarch64_sve_st1_scatter_scalar_offset">; @@ -432,11 +432,11 @@ def SVST1H_SCATTER_32B_INDICES_UU : MInst<"svst1h_scatter_[{3}]index[_{d}]", "vP def SVST1_SCATTER_INDEX_S : MInst<"svst1_scatter[_{2}base]_index[_{d}]", "vPuld", "ilUiUlfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1_scatter_scalar_offset">; def SVST1H_SCATTER_INDEX_S : MInst<"svst1h_scatter[_{2}base]_index[_{d}]", "vPuld", "ilUiUl", [IsScatterStore], MemEltTyInt16, "aarch64_sve_st1_scatter_scalar_offset">; def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">; -} // let TargetGuard = "sve" +} // let SVETargetGuard = "sve" multiclass StructStore { def : SInst; - let TargetGuard = "(sve|sme),bf16" in { + let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def: SInst; } } @@ -456,12 +456,12 @@ def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, Verify // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; } -let TargetGuard = "sve2p1" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in { // Contiguous truncating store from quadword (single vector). 
def SVST1UWQ : MInst<"svst1wq[_{d}]", "vPcd", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1wq">; def SVST1UWQ_VNUM : MInst<"svst1wq_vnum[_{d}]", "vPcld", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1wq">; @@ -508,7 +508,7 @@ def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch, VerifyRuntimeM def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_prf">; def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch, VerifyRuntimeMode], MemEltTyInt64, "aarch64_sve_prf">; -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { // Prefetch (Vector bases) def SVPRFB_GATHER_BASES : MInst<"svprfb_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt8, "aarch64_sve_prfb_gather_scalar_offset">; def SVPRFH_GATHER_BASES : MInst<"svprfh_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt16, "aarch64_sve_prfh_gather_scalar_offset">; @@ -541,24 +541,24 @@ def SVPRFB_GATHER_BASES_OFFSET : MInst<"svprfb_gather[_{2}base]_offset", "vPdlJ" def SVPRFH_GATHER_BASES_OFFSET : MInst<"svprfh_gather[_{2}base]_index", "vPdlJ", "UiUl", [IsGatherPrefetch], MemEltTyInt16, "aarch64_sve_prfh_gather_scalar_offset">; def SVPRFW_GATHER_BASES_OFFSET : MInst<"svprfw_gather[_{2}base]_index", "vPdlJ", "UiUl", [IsGatherPrefetch], MemEltTyInt32, "aarch64_sve_prfw_gather_scalar_offset">; def SVPRFD_GATHER_BASES_OFFSET : MInst<"svprfd_gather[_{2}base]_index", "vPdlJ", "UiUl", [IsGatherPrefetch], MemEltTyInt64, "aarch64_sve_prfd_gather_scalar_offset">; -} // let TargetGuard = "sve" +} // let SVETargetGuard = "sve" //////////////////////////////////////////////////////////////////////////////// // Address calculations -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { def SVADRB : SInst<"svadrb[_{0}base]_[{2}]offset", "uud", "ilUiUl", MergeNone, "aarch64_sve_adrb">; def SVADRH : SInst<"svadrh[_{0}base]_[{2}]index", "uud", "ilUiUl", MergeNone, "aarch64_sve_adrh">; def SVADRW : SInst<"svadrw[_{0}base]_[{2}]index", "uud", "ilUiUl", MergeNone, "aarch64_sve_adrw">; def SVADRD : SInst<"svadrd[_{0}base]_[{2}]index", "uud", "ilUiUl", MergeNone, "aarch64_sve_adrd">; -} // let TargetGuard = "sve" +} // let SVETargetGuard = "sve" //////////////////////////////////////////////////////////////////////////////// // Scalar to vector def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone, "", [VerifyRuntimeMode]>; def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone, "", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone, "", [VerifyRuntimeMode]>; } def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone, "", [VerifyRuntimeMode]>; @@ -566,7 +566,7 @@ def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone, "", [VerifyRu multiclass svdup_base { def NAME : SInst; - let TargetGuard = "(sve|sme),bf16" in { + let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def _BF16: SInst; } } @@ -657,9 +657,7 @@ def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarc //////////////////////////////////////////////////////////////////////////////// // Logical operations -let TargetGuard = "sve|sme" in { // FIXME: Make this the default for most operations. 
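// A minimal sketch of the split-guard idiom applied throughout this change
// (SVFOO and its intrinsic name are illustrative, not defined here): an
// intrinsic available from SVE2 in non-streaming code and from SME in
// streaming code is now declared as
//   let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
//     def SVFOO : SInst<"svfoo[_{d}]", "ddd", "il", MergeNone, "aarch64_sve_foo", [VerifyRuntimeMode]>;
//   }
// replacing the old single TargetGuard = "sve2|sme" spelling. Entries with no
// streaming-mode equivalent instead set SMETargetGuard = InvalidMode, and a
// few fully streaming-compatible blocks, such as the one here, drop the
// wrapper entirely.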
defm SVAND : SInstZPZZ<"svand", "csilUcUsUiUl", "aarch64_sve_and", "aarch64_sve_and_u">; -} defm SVBIC : SInstZPZZ<"svbic", "csilUcUsUiUl", "aarch64_sve_bic", "aarch64_sve_bic_u">; defm SVEOR : SInstZPZZ<"sveor", "csilUcUsUiUl", "aarch64_sve_eor", "aarch64_sve_eor_u">; defm SVORR : SInstZPZZ<"svorr", "csilUcUsUiUl", "aarch64_sve_orr", "aarch64_sve_orr_u">; @@ -697,7 +695,7 @@ def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aa def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckShiftRight, 1>]>; def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr", [VerifyRuntimeMode]>; } @@ -786,7 +784,7 @@ defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [VerifyRunt defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [VerifyRuntimeMode]>; defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt", [VerifyRuntimeMode]>; } @@ -815,9 +813,7 @@ defm SVABS_F : SInstZPZ<"svabs", "hfd", "aarch64_sve_fabs">; defm SVNEG_F : SInstZPZ<"svneg", "hfd", "aarch64_sve_fneg">; defm SVABD_F : SInstZPZZ<"svabd", "hfd", "aarch64_sve_fabd", "aarch64_sve_fabd_u">; -let TargetGuard = "sve|sme" in { // FIXME: Make this the default for most operations. defm SVADD_F : SInstZPZZ<"svadd", "hfd", "aarch64_sve_fadd", "aarch64_sve_fadd_u">; -} defm SVDIV_F : SInstZPZZ<"svdiv", "hfd", "aarch64_sve_fdiv", "aarch64_sve_fdiv_u">; defm SVDIVR_F : SInstZPZZ<"svdivr", "hfd", "aarch64_sve_fdivr", "aarch64_sve_fdiv_u", [ReverseMergeAnyBinOp]>; defm SVMAX_F : SInstZPZZ<"svmax", "hfd", "aarch64_sve_fmax", "aarch64_sve_fmax_u">; @@ -839,7 +835,7 @@ defm SVRINTX : SInstZPZ<"svrintx", "hfd", "aarch64_sve_frintx">; defm SVRINTZ : SInstZPZ<"svrintz", "hfd", "aarch64_sve_frintz">; defm SVSQRT : SInstZPZ<"svsqrt", "hfd", "aarch64_sve_fsqrt">; -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { def SVEXPA : SInst<"svexpa[_{d}]", "du", "hfd", MergeNone, "aarch64_sve_fexpa_x">; def SVTMAD : SInst<"svtmad[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_ftmad_x", [], [ImmCheck<2, ImmCheck0_7>]>; def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftsmul_x">; @@ -947,7 +943,7 @@ defm SVFCVTZS_S64_F16 : SInstCvtMXZ<"svcvt_s64[_f16]", "ddPO", "dPO", "l", "aar defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i", "aarch64_sve_fcvtzs", [IsOverloadCvt]>; defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l", "aarch64_sve_fcvtzs_i64f32">; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { defm SVCVT_BF16_F32 : SInstCvtMXZ<"svcvt_bf16[_f32]", "ddPM", "dPM", "b", "aarch64_sve_fcvt_bf16f32">; def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone, VerifyRuntimeMode]>; } @@ -1007,7 +1003,7 @@ defm SVFCVT_F32_F64 : SInstCvtMXZ<"svcvt_f32[_f64]", "MMPd", "MPd", "d", "aarc defm SVFCVT_F64_F16 : SInstCvtMXZ<"svcvt_f64[_f16]", "ddPO", "dPO", "d", "aarch64_sve_fcvt_f64f16">; defm 
SVFCVT_F64_F32 : SInstCvtMXZ<"svcvt_f64[_f32]", "ddPM", "dPM", "d", "aarch64_sve_fcvt_f64f32">; -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { defm SVCVTLT_F32 : SInstCvtMX<"svcvtlt_f32[_f16]", "ddPh", "dPh", "f", "aarch64_sve_fcvtlt_f32f16">; defm SVCVTLT_F64 : SInstCvtMX<"svcvtlt_f64[_f32]", "ddPh", "dPh", "d", "aarch64_sve_fcvtlt_f64f32">; @@ -1027,7 +1023,7 @@ def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch6 multiclass SVEPerm { def : SInst; - let TargetGuard = "(sve|sme),bf16" in { + let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def: SInst; } } @@ -1037,7 +1033,7 @@ defm SVCLASTA_N : SVEPerm<"svclasta[_n_{d}]", "sPsd", "aarch64_sve_clasta_n">; defm SVCLASTB : SVEPerm<"svclastb[_{d}]", "dPdd", "aarch64_sve_clastb">; defm SVCLASTB_N : SVEPerm<"svclastb[_n_{d}]", "sPsd", "aarch64_sve_clastb_n">; -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { def SVCOMPACT : SInst<"svcompact[_{d}]", "dPd", "ilUiUlfd", MergeNone, "aarch64_sve_compact">; } @@ -1046,13 +1042,13 @@ def SVCOMPACT : SInst<"svcompact[_{d}]", "dPd", "ilUiUlfd", MergeNo // instruction such as DUP (indexed) if the lane index fits the range of the // instruction's immediate. def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVDUP_LANE_BF16 : SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; } def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane", [VerifyRuntimeMode]>; } def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckExtract, 1>]>; @@ -1063,7 +1059,7 @@ def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNo def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice", [VerifyRuntimeMode]>; def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; } @@ -1078,7 +1074,7 @@ def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNo def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1", [VerifyRuntimeMode]>; def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckExtract, 1>]>; def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev", [VerifyRuntimeMode]>; def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel", [VerifyRuntimeMode]>; @@ -1127,11 +1123,8 @@ def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip // 
Predicate creation def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone, VerifyRuntimeMode]>; - -let TargetGuard = "sve|sme" in { // FIXME: Make this the default for most operations. def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [VerifyRuntimeMode]>; def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL, VerifyRuntimeMode]>; -} def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone, "", [VerifyRuntimeMode]>; def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone, "", [VerifyRuntimeMode]>; @@ -1143,9 +1136,7 @@ def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone, "", [V //////////////////////////////////////////////////////////////////////////////// // Predicate operations -let TargetGuard = "sve|sme" in { // FIXME: Make this the default for most operations. def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z", [VerifyRuntimeMode]>; -} def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z", [VerifyRuntimeMode]>; def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z", [VerifyRuntimeMode]>; def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone, "", [VerifyRuntimeMode]>; // Uses custom expansion @@ -1176,7 +1167,7 @@ def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ //////////////////////////////////////////////////////////////////////////////// // FFR manipulation -let TargetGuard = "sve" in { +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { def SVRDFFR : SInst<"svrdffr", "Pv", "Pc", MergeNone, "", [IsOverloadNone]>; def SVRDFFR_Z : SInst<"svrdffr_z", "PP", "Pc", MergeNone, "", [IsOverloadNone]>; def SVSETFFR : SInst<"svsetffr", "vv", "", MergeNone, "", [IsOverloadNone]>; @@ -1199,7 +1190,7 @@ def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendS def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp", [VerifyRuntimeMode]>; def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone, "", [VerifyRuntimeMode]>; -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone, "", [VerifyRuntimeMode]>; } @@ -1265,13 +1256,13 @@ def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, " def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32", [VerifyRuntimeMode]>; def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64", [VerifyRuntimeMode]>; -let TargetGuard = "sve,i8mm" in { +let SVETargetGuard = "sve,i8mm", SMETargetGuard = InvalidMode in { def SVMLLA_S32 : SInst<"svmmla[_s32]", "ddqq","i", MergeNone, "aarch64_sve_smmla">; def SVMLLA_U32 : SInst<"svmmla[_u32]", "ddqq","Ui", MergeNone, "aarch64_sve_ummla">; def SVUSMLLA_S32 : SInst<"svusmmla[_s32]", "ddbq","i", MergeNone, "aarch64_sve_usmmla">; } -let TargetGuard = "(sve|sme),i8mm" in { +let SVETargetGuard = "sve,i8mm", SMETargetGuard = "sme,i8mm"in { def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot", [VerifyRuntimeMode]>; def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot", [VerifyRuntimeMode]>; def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, 
VerifyRuntimeMode]>; @@ -1281,11 +1272,11 @@ def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "a def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } -let TargetGuard = "sve,f32mm" in { +let SVETargetGuard = "sve,f32mm", SMETargetGuard = InvalidMode in { def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla">; } -let TargetGuard = "sve,f64mm" in { +let SVETargetGuard = "sve,f64mm", SMETargetGuard = InvalidMode in { def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd","d", MergeNone, "aarch64_sve_fmmla">; def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q">; @@ -1295,7 +1286,7 @@ def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNo def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q">; } -let TargetGuard = "sve,bf16,f64mm" in { +let SVETargetGuard = "sve,bf16,f64mm", SMETargetGuard = InvalidMode in { def SVTRN1Q_BF16 : SInst<"svtrn1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q_BF16 : SInst<"svtrn2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2q">; def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q">; @@ -1306,7 +1297,6 @@ def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarc //////////////////////////////////////////////////////////////////////////////// // Vector creation -let TargetGuard = "sve|sme" in { // FIXME: Make this the default for most operations. def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; @@ -1315,9 +1305,8 @@ def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [I def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; -} -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; @@ -1328,14 +1317,13 @@ def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsT def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVCREATE_2_B : SInst<"svcreate2[_b]", "2dd", "Pc", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; def SVCREATE_4_B : SInst<"svcreate4[_b]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; } //////////////////////////////////////////////////////////////////////////////// // Vector insertion and 
extraction -let TargetGuard = "sve|sme" in { // FIXME: Make this the default for most operations. def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; @@ -1343,9 +1331,8 @@ def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [Is def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; -} -let TargetGuard = "(sve|sme),bf16" in { +let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; @@ -1355,7 +1342,7 @@ def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVGET_2_B : SInst<"svget2[_b]", "d2i", "Pc", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; def SVGET_4_B : SInst<"svget4[_b]", "d4i", "Pc", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; @@ -1363,13 +1350,13 @@ let TargetGuard = "sve2p1|sme2" in { def SVSET_4_B : SInst<"svset4[_b]", "44id", "Pc", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVUNDEF_2_B: Inst<"svundef2_b", "2", "Pc", MergeNone, "", [IsUndef, VerifyRuntimeMode], []>; def SVUNDEF_4_B: Inst<"svundef4_b", "4", "Pc", MergeNone, "", [IsUndef, VerifyRuntimeMode], []>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhileOrMultiVecCvt, VerifyRuntimeMode]>; def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhileOrMultiVecCvt, VerifyRuntimeMode]>; def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhileOrMultiVecCvt, VerifyRuntimeMode]>; @@ -1380,7 +1367,7 @@ def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNon def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhileOrMultiVecCvt, VerifyRuntimeMode]>; } -let TargetGuard = 
"sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVWHILEGE_S64_X2 : SInst<"svwhilege_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege_x2", [VerifyRuntimeMode]>; def SVWHILEGT_S64_X2 : SInst<"svwhilegt_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt_x2", [VerifyRuntimeMode]>; def SVWHILEHI_U64_X2 : SInst<"svwhilegt_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehi_x2", [VerifyRuntimeMode]>; @@ -1395,7 +1382,7 @@ let TargetGuard = "sve2p1|sme2" in { //////////////////////////////////////////////////////////////////////////////// // SVE2 - Uniform DSP operations -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { defm SVQADD_S : SInstZPZZ<"svqadd", "csli", "aarch64_sve_sqadd", "aarch64_sve_sqadd">; defm SVQADD_U : SInstZPZZ<"svqadd", "UcUsUiUl", "aarch64_sve_uqadd", "aarch64_sve_uqadd">; defm SVHADD_S : SInstZPZZ<"svhadd", "csli", "aarch64_sve_shadd", "aarch64_sve_shadd">; @@ -1430,7 +1417,7 @@ multiclass SInstZPZxZ; } -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl", [VerifyRuntimeMode]>; defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl", [VerifyRuntimeMode]>; defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl", [VerifyRuntimeMode]>; @@ -1484,7 +1471,7 @@ multiclass SInstPairwise; } -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp", [VerifyRuntimeMode]>; defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp", [VerifyRuntimeMode]>; defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp", [VerifyRuntimeMode]>; @@ -1500,7 +1487,7 @@ defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp", [ //////////////////////////////////////////////////////////////////////////////// // SVE2 - Widening pairwise arithmetic -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp", [VerifyRuntimeMode]>; def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp", [VerifyRuntimeMode]>; def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp", [VerifyRuntimeMode]>; @@ -1514,7 +1501,7 @@ def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_s // SVE2 - Bitwise ternary logical instructions // -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [VerifyRuntimeMode]>; def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [VerifyRuntimeMode]>; def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [VerifyRuntimeMode]>; @@ -1534,7 +1521,7 @@ def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aar //////////////////////////////////////////////////////////////////////////////// // SVE2 - Large integer arithmetic -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb", [VerifyRuntimeMode]>; def SVADCLT : 
SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt", [VerifyRuntimeMode]>; def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb", [VerifyRuntimeMode]>; @@ -1549,7 +1536,7 @@ def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve //////////////////////////////////////////////////////////////////////////////// // SVE2 - Multiplication by indexed elements -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; @@ -1557,7 +1544,7 @@ def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "a //////////////////////////////////////////////////////////////////////////////// // SVE2 - Uniform complex integer arithmetic -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckComplexRot90_270>]>; def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckComplexRot90_270>]>; def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckComplexRotAll90>]>; @@ -1586,7 +1573,7 @@ multiclass SInstWideDSPWide { def _N : SInst; } -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { defm SVABALB_S : SInstWideDSPAcc<"svabalb", "sil", "aarch64_sve_sabalb">; defm SVABALB_U : SInstWideDSPAcc<"svabalb", "UsUiUl", "aarch64_sve_uabalb">; defm SVABALT_S : SInstWideDSPAcc<"svabalt", "sil", "aarch64_sve_sabalt">; @@ -1665,7 +1652,7 @@ def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, " //////////////////////////////////////////////////////////////////////////////// // SVE2 - Narrowing DSP operations -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [VerifyRuntimeMode]>; def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [VerifyRuntimeMode]>; def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [VerifyRuntimeMode]>; @@ -1705,7 +1692,7 @@ def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, " //////////////////////////////////////////////////////////////////////////////// // SVE2 - Unary narrowing operations -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb", [VerifyRuntimeMode]>; def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb", [VerifyRuntimeMode]>; def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb", [VerifyRuntimeMode]>; @@ -1718,7 +1705,7 @@ def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, 
"aarch64_s //////////////////////////////////////////////////////////////////////////////// // SVE2 - Widening complex integer arithmetic -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { defm SVADDLBT : SInstWideDSPLong<"svaddlbt", "sil", "aarch64_sve_saddlbt">; defm SVSUBLBT : SInstWideDSPLong<"svsublbt", "sil", "aarch64_sve_ssublbt">; defm SVSUBLTB : SInstWideDSPLong<"svsubltb", "sil", "aarch64_sve_ssubltb">; @@ -1729,7 +1716,7 @@ defm SVQDMLSLBT : SInstWideDSPAcc<"svqdmlslbt", "sil", "aarch64_sve_sqdmlslbt">; //////////////////////////////////////////////////////////////////////////////// // SVE2 - Non-temporal gather/scatter -let TargetGuard = "sve2" in { +let SVETargetGuard = "sve2", SMETargetGuard = InvalidMode in { // Non-temporal gather load one vector (vector base) def SVLDNT1_GATHER_BASES_U : MInst<"svldnt1_gather[_{2}base]_{0}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ldnt1_gather_scalar_offset">; def SVLDNT1SB_GATHER_BASES_U : MInst<"svldnt1sb_gather[_{2}base]_{0}", "dPu", "ilUiUl", [IsGatherLoad], MemEltTyInt8, "aarch64_sve_ldnt1_gather_scalar_offset">; @@ -1852,7 +1839,7 @@ def SVSTNT1W_SCATTER_INDEX_S : MInst<"svstnt1w_scatter[_{2}base]_index[_{d}]", " //////////////////////////////////////////////////////////////////////////////// // SVE2 - Polynomial arithmetic -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [VerifyRuntimeMode]>; def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [VerifyRuntimeMode]>; def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [VerifyRuntimeMode]>; @@ -1872,7 +1859,7 @@ def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", Mer //////////////////////////////////////////////////////////////////////////////// // SVE2 - Complex integer dot product -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckComplexRotAll90>]>; def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [VerifyRuntimeMode], [ImmCheck<4, ImmCheckComplexRotAll90>, ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; @@ -1881,7 +1868,7 @@ def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch //////////////////////////////////////////////////////////////////////////////// // SVE2 - Floating-point widening multiply-accumulate -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb", [VerifyRuntimeMode]>; def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb", [VerifyRuntimeMode]>; def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; @@ -1899,7 +1886,7 @@ def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aar //////////////////////////////////////////////////////////////////////////////// // SVE2 - Floating-point integer binary logarithm -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, 
"aarch64_sve_flogb", [VerifyRuntimeMode]>; def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb", [VerifyRuntimeMode]>; def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb", [VerifyRuntimeMode]>; @@ -1908,7 +1895,7 @@ def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_ //////////////////////////////////////////////////////////////////////////////// // SVE2 - Vector Histogram count -let TargetGuard = "sve2" in { +let SVETargetGuard = "sve2", SMETargetGuard = InvalidMode in { def SVHISTCNT : SInst<"svhistcnt[_{d}]_z", "uPdd", "ilUiUl", MergeNone, "aarch64_sve_histcnt">; def SVHISTSEG : SInst<"svhistseg[_{d}]", "udd", "cUc", MergeNone, "aarch64_sve_histseg">; } @@ -1916,14 +1903,14 @@ def SVHISTSEG : SInst<"svhistseg[_{d}]", "udd", "cUc", MergeNone, "aarch6 //////////////////////////////////////////////////////////////////////////////// // SVE2 - Character match -let TargetGuard = "sve2" in { +let SVETargetGuard = "sve2", SMETargetGuard = InvalidMode in { def SVMATCH : SInst<"svmatch[_{d}]", "PPdd", "csUcUs", MergeNone, "aarch64_sve_match">; def SVNMATCH : SInst<"svnmatch[_{d}]", "PPdd", "csUcUs", MergeNone, "aarch64_sve_nmatch">; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Contiguous conflict detection -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, VerifyRuntimeMode]>; @@ -1935,19 +1922,19 @@ def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sv def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW, VerifyRuntimeMode]>; } -let TargetGuard = "(sve2|sme),bf16" in { +let SVETargetGuard = "sve2,bf16", SMETargetGuard = "sme,bf16" in { def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Extended table lookup/permute -let TargetGuard = "sve2|sme" in { +let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone, "", [VerifyRuntimeMode]>; def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; } -let TargetGuard = "(sve2|sme),bf16" in { +let SVETargetGuard = "sve2,bf16", SMETargetGuard = "sme,bf16" in { def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRuntimeMode]>; def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; } @@ -1955,7 +1942,7 @@ def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx //////////////////////////////////////////////////////////////////////////////// // SVE2 - Optional -let TargetGuard = "sve2-aes" in { +let SVETargetGuard = "sve2-aes", SMETargetGuard = InvalidMode in { def SVAESD : SInst<"svaesd[_{d}]", 
"ddd", "Uc", MergeNone, "aarch64_sve_aesd", [IsOverloadNone]>; def SVAESIMC : SInst<"svaesimc[_{d}]", "dd", "Uc", MergeNone, "aarch64_sve_aesimc", [IsOverloadNone]>; def SVAESE : SInst<"svaese[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_aese", [IsOverloadNone]>; @@ -1968,16 +1955,16 @@ def SVPMULLT_PAIR_U64 : SInst<"svpmullt_pair[_{d}]", "ddd", "Ul", MergeNone, def SVPMULLT_PAIR_N_U64 : SInst<"svpmullt_pair[_n_{d}]", "dda", "Ul", MergeNone, "aarch64_sve_pmullt_pair">; } -let TargetGuard = "sve2-sha3" in { +let SVETargetGuard = "sve2-sha3", SMETargetGuard = InvalidMode in { // FIXME: valid from sme2p1. def SVRAX1 : SInst<"svrax1[_{d}]", "ddd", "lUl", MergeNone, "aarch64_sve_rax1", [IsOverloadNone]>; } -let TargetGuard = "sve2-sm4" in { +let SVETargetGuard = "sve2-sm4", SMETargetGuard = InvalidMode in { def SVSM4E : SInst<"svsm4e[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm4e", [IsOverloadNone]>; def SVSM4EKEY : SInst<"svsm4ekey[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm4ekey", [IsOverloadNone]>; } -let TargetGuard = "sve2-bitperm" in { +let SVETargetGuard = "sve2-bitperm", SMETargetGuard = InvalidMode in { def SVBDEP : SInst<"svbdep[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBDEP_N : SInst<"svbdep[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBEXT : SInst<"svbext[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bext_x">; @@ -1986,7 +1973,7 @@ def SVBGRP : SInst<"svbgrp[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sv def SVBGRP_N : SInst<"svbgrp[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bgrp_x">; } -let TargetGuard = "sve2p1|sme" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme" in { def SVPSEL_B : SInst<"svpsel_lane_b8", "PPPm", "Pc", MergeNone, "", [VerifyRuntimeMode], []>; def SVPSEL_H : SInst<"svpsel_lane_b16", "PPPm", "Ps", MergeNone, "", [VerifyRuntimeMode], []>; def SVPSEL_S : SInst<"svpsel_lane_b32", "PPPm", "Pi", MergeNone, "", [VerifyRuntimeMode], []>; @@ -1994,7 +1981,7 @@ def SVPSEL_D : SInst<"svpsel_lane_b64", "PPPm", "Pl", MergeNone, "", [VerifyRunt } // Standalone sve2.1 builtins -let TargetGuard = "sve2p1" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in { def SVORQV : SInst<"svorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orqv", [IsReductionQV]>; def SVEORQV : SInst<"sveorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorqv", [IsReductionQV]>; def SVADDQV : SInst<"svaddqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_addqv", [IsReductionQV]>; @@ -2011,7 +1998,7 @@ def SVFMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_ def SVFMINQV : SInst<"svminqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminqv", [IsReductionQV]>; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; @@ -2050,7 +2037,7 @@ multiclass MultiVecLoad { def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { defm LD1 : MultiVecLoad<"ld1">; defm LDNT1 : MultiVecLoad<"ldnt1">; } @@ -2075,12 +2062,12 @@ multiclass 
MultiVecStore { def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { defm ST1 : MultiVecStore<"st1">; defm STNT1 : MultiVecStore<"stnt1">; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVDOT_X2_S : SInst<"svdot[_{d}_{2}]", "ddhh", "i", MergeNone, "aarch64_sve_sdot_x2", [VerifyRuntimeMode], []>; def SVDOT_X2_U : SInst<"svdot[_{d}_{2}]", "ddhh", "Ui", MergeNone, "aarch64_sve_udot_x2", [VerifyRuntimeMode], []>; def SVDOT_X2_F : SInst<"svdot[_{d}_{2}]", "ddhh", "f", MergeNone, "aarch64_sve_fdot_x2", [VerifyRuntimeMode], []>; @@ -2089,14 +2076,14 @@ def SVDOT_LANE_X2_U : SInst<"svdot_lane[_{d}_{2}]", "ddhhi", "Ui", MergeNone, "a def SVDOT_LANE_X2_F : SInst<"svdot_lane[_{d}_{2}]", "ddhhi", "f", MergeNone, "aarch64_sve_fdot_lane_x2", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1|sme" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme" in { def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [VerifyRuntimeMode], []>; def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [VerifyRuntimeMode], []>; defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUlbhfd", "aarch64_sve_revd">; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsOverloadNone, VerifyRuntimeMode], []>; def SVPFALSE_COUNT_ALIAS : SInst<"svpfalse_c", "}v", "", MergeNone, "", [IsOverloadNone, VerifyRuntimeMode]>; @@ -2105,7 +2092,7 @@ let TargetGuard = "sve2p1|sme2" in { def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<1, ImmCheck2_4_Mul2>]>; } -let TargetGuard = "(sve2|sme2),b16b16" in { +let SVETargetGuard = "sve2,b16b16", SMETargetGuard = "sme2,b16b16" in { defm SVMUL_BF : SInstZPZZ<"svmul", "b", "aarch64_sve_fmul", "aarch64_sve_fmul_u", [VerifyRuntimeMode]>; defm SVADD_BF : SInstZPZZ<"svadd", "b", "aarch64_sve_fadd", "aarch64_sve_fadd_u", [VerifyRuntimeMode]>; defm SVSUB_BF : SInstZPZZ<"svsub", "b", "aarch64_sve_fsub", "aarch64_sve_fsub_u", [VerifyRuntimeMode]>; @@ -2132,7 +2119,7 @@ multiclass MinMaxIntr { def SVF # NAME : SInst<"sv" # i # "[" # zm # "_{d}_" # mul # "]", t, "bhfd", MergeNone, "aarch64_sve_f" # i # zm # "_" # mul, [IsStreaming], []>; } -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { // == SMAX / UMAX / FMAX == defm MAX_SINGLE_X2 : MinMaxIntr<"max", "_single", "x2", "22d">; defm MAX_MULTI_X2 : MinMaxIntr<"max", "", "x2", "222">; @@ -2154,13 +2141,13 @@ multiclass SInstMinMaxByVector { def NAME # _X4 : SInst<"sv" # name # "nm[_{d}_x4]", "444", "bhfd", MergeNone, "aarch64_sve_f" # name # "nm_x4", [IsStreaming], []>; } -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { // == FMINNM / FMAXNM == defm SVMINNM : SInstMinMaxByVector<"min">; defm SVMAXNM : SInstMinMaxByVector<"max">; } -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { // FRINTA / FRINTM / FRINTN / FRINTP def SVRINTA_X2 : SInst<"svrinta[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frinta_x2", [IsStreaming], []>; def 
SVRINTA_X4 : SInst<"svrinta[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frinta_x4", [IsStreaming], []>; @@ -2175,7 +2162,7 @@ let TargetGuard = "sme2" in { def SVRINTP_X4 : SInst<"svrintp[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintp_x4", [IsStreaming], []>; } -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVSCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "csil", MergeNone, "aarch64_sve_sclamp_single_x2", [IsStreaming], []>; def SVUCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp_single_x2", [IsStreaming], []>; def SVFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x2", [IsStreaming], []>; @@ -2185,12 +2172,12 @@ let TargetGuard = "sme2" in { def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x4", [IsStreaming], []>; } -let TargetGuard = "sme2,b16b16"in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,b16b16"in { def SVBFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x2", [IsStreaming], []>; def SVBFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x4", [IsStreaming], []>; } -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { // == ADD (vectors) == def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>; def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>; @@ -2232,7 +2219,7 @@ let TargetGuard = "sme2" in { def SVSQDMULH_X4 : SInst<"svqdmulh[_{d}_x4]", "444", "csil", MergeNone, "aarch64_sve_sqdmulh_vgx4", [IsStreaming], []>; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [VerifyRuntimeMode], []>; def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [VerifyRuntimeMode], []>; @@ -2244,7 +2231,7 @@ let TargetGuard = "sve2p1|sme2" in { def SVSQRSHRUN_X2 : SInst<"svqrshrun[_n]_{0}[_{d}_x2]", "e2i", "i", MergeNone, "aarch64_sve_sqrshrun_x2", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck1_16>]>; } -let TargetGuard = "sve2p1" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in { // ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 def SVZIPQ1 : SInst<"svzipq1[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq1", [], []>; def SVZIPQ2 : SInst<"svzipq2[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq2", [], []>; @@ -2278,7 +2265,7 @@ let TargetGuard = "sve2p1" in { defm SVPMOV_TO_VEC_LANE_D : PMOV_TO_VEC<"svpmov", "lUl", "aarch64_sve_pmov_to_vector_lane" ,[], ImmCheck1_7>; } -let TargetGuard = "sve2p1|sme2p1" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2p1" in { // DUPQ def SVDUP_LANEQ_B : SInst<"svdup_laneq[_{d}]", "ddi", "cUc", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_15>]>; def SVDUP_LANEQ_H : SInst<"svdup_laneq[_{d}]", "ddi", "sUsh", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_7>]>; @@ -2286,14 +2273,14 @@ let TargetGuard = "sve2p1|sme2p1" in { def SVDUP_LANEQ_D : SInst<"svdup_laneq[_{d}]", "ddi", "lUld", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; } -let 
TargetGuard = "(sve2p1|sme2p1),bf16" in { +let SVETargetGuard = "sve2p1,bf16", SMETargetGuard = "sme2p1,bf16" in { def SVDUP_LANEQ_BF16 : SInst<"svdup_laneq[_{d}]", "ddi", "b", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_7>]>; } // // Multi-vector convert to/from floating-point. // -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVCVT_F16_X2 : SInst<"svcvt_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvt_x2", [IsStreaming],[]>; def SVCVT_BF16_X2 : SInst<"svcvt_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvt_x2", [IsOverloadNone, IsStreaming],[]>; @@ -2308,14 +2295,14 @@ let TargetGuard = "sme2" in { def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i", MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; } -let TargetGuard = "sme-f16f16" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme-f16f16" in { def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>; } // // Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16 // -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVCVTN_F16_X2 : SInst<"svcvtn_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvtn_x2", [IsStreaming],[]>; def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>; } @@ -2323,14 +2310,14 @@ let TargetGuard = "sme2" in { // //Multi-vector floating-point convert from half-precision to deinterleaved single-precision. // -let TargetGuard = "sme-f16f16" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme-f16f16" in { def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>; } // // Multi-vector saturating extract narrow // -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVQCVT_S16_S32_X2 : SInst<"svqcvt_s16[_{d}_x2]", "h2.d", "i", MergeNone, "aarch64_sve_sqcvt_x2", [IsStreaming], []>; def SVQCVT_U16_U32_X2 : SInst<"svqcvt_u16[_{d}_x2]", "e2.d", "Ui", MergeNone, "aarch64_sve_uqcvt_x2", [IsStreaming], []>; def SVQCVT_U16_S32_X2 : SInst<"svqcvt_u16[_{d}_x2]", "e2.d", "i", MergeNone, "aarch64_sve_sqcvtu_x2", [IsStreaming], []>; @@ -2347,13 +2334,13 @@ let TargetGuard = "sme2" in { // // Multi-vector saturating extract narrow and interleave // -let TargetGuard = "sme2|sve2p1" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVQCVTN_S16_S32_X2 : SInst<"svqcvtn_s16[_{d}_x2]", "h2.d", "i", MergeNone, "aarch64_sve_sqcvtn_x2", [VerifyRuntimeMode], []>; def SVQCVTN_U16_U32_X2 : SInst<"svqcvtn_u16[_{d}_x2]", "e2.d", "Ui", MergeNone, "aarch64_sve_uqcvtn_x2", [VerifyRuntimeMode], []>; def SVQCVTN_U16_S32_X2 : SInst<"svqcvtn_u16[_{d}_x2]", "e2.d", "i", MergeNone, "aarch64_sve_sqcvtun_x2", [VerifyRuntimeMode], []>; } -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVQCVTN_S8_S32_X4 : SInst<"svqcvtn_s8[_{d}_x4]", "q4.d", "i", MergeNone, "aarch64_sve_sqcvtn_x4", [IsStreaming], []>; def SVQCVTN_U8_U32_X4 : SInst<"svqcvtn_u8[_{d}_x4]", "b4.d", "Ui", MergeNone, "aarch64_sve_uqcvtn_x4", [IsStreaming], []>; def SVQCVTN_U8_S32_X4 : SInst<"svqcvtn_u8[_{d}_x4]", "b4.d", "i", MergeNone, "aarch64_sve_sqcvtun_x4", [IsStreaming], []>; @@ -2367,7 +2354,7 @@ let TargetGuard = "sme2" in 
{ // Multi-vector zip/unzip // -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVZIP_X2 : SInst<"svzip[_{d}_x2]", "22", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zip_x2", [IsStreaming], []>; def SVZIPQ_X2 : SInst<"svzipq[_{d}_x2]", "22", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq_x2", [IsStreaming], []>; def SVZIP_X4 : SInst<"svzip[_{d}_x4]", "44", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zip_x4", [IsStreaming], []>; @@ -2383,14 +2370,14 @@ let TargetGuard = "sme2" in { // Multi-vector unpack // -let TargetGuard = "sme2" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVSUNPK_X2 : SInst<"svunpk_{d}[_{1}_x2]", "2h", "sil", MergeNone, "aarch64_sve_sunpk_x2", [IsStreaming], []>; def SVUUNPK_X2 : SInst<"svunpk_{d}[_{1}_x2]", "2h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x2", [IsStreaming], []>; def SVSUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "sil", MergeNone, "aarch64_sve_sunpk_x4", [IsStreaming], []>; def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; } -let TargetGuard = "sve2p1|sme2" in { +let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { // == BFloat16 multiply-subtract == def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, VerifyRuntimeMode], []>; def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone, VerifyRuntimeMode], []>; diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index 707f445858067d..6ec357825a132a 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -267,12 +267,15 @@ class ImmCheck { ImmCheckType Kind = kind; } +defvar InvalidMode = ""; + class Inst ft, list ch, MemEltType met = MemEltTyDefault> { string Name = n; string Prototype = p; string Types = t; - string TargetGuard = "sve|sme"; + string SVETargetGuard = "sve"; + string SMETargetGuard = "sme"; int Merge = mt.Value; string MergeSuffix = mt.Suffix; string LLVMIntrinsic = i; diff --git a/clang/include/clang/CodeGen/CodeGenABITypes.h b/clang/include/clang/CodeGen/CodeGenABITypes.h index fda0855dc86837..d4822dc160082b 100644 --- a/clang/include/clang/CodeGen/CodeGenABITypes.h +++ b/clang/include/clang/CodeGen/CodeGenABITypes.h @@ -42,6 +42,7 @@ class CXXConstructorDecl; class CXXDestructorDecl; class CXXRecordDecl; class CXXMethodDecl; +class GlobalDecl; class ObjCMethodDecl; class ObjCProtocolDecl; @@ -104,6 +105,9 @@ llvm::Type *convertTypeForMemory(CodeGenModule &CGM, QualType T); unsigned getLLVMFieldNumber(CodeGenModule &CGM, const RecordDecl *RD, const FieldDecl *FD); +/// Return a declaration discriminator for the given global decl. +uint16_t getPointerAuthDeclDiscriminator(CodeGenModule &CGM, GlobalDecl GD); + /// Given the language and code-generation options that Clang was configured /// with, set the default LLVM IR attributes for a function definition. 
/// The attributes set here are mostly global target-configuration and diff --git a/clang/include/clang/CodeGen/ConstantInitBuilder.h b/clang/include/clang/CodeGen/ConstantInitBuilder.h index 498acfd3801318..28d4764b6d60b0 100644 --- a/clang/include/clang/CodeGen/ConstantInitBuilder.h +++ b/clang/include/clang/CodeGen/ConstantInitBuilder.h @@ -25,8 +25,11 @@ #include namespace clang { -namespace CodeGen { +class GlobalDecl; +class PointerAuthSchema; +class QualType; +namespace CodeGen { class CodeGenModule; /// A convenience builder class for complex constant initializers, @@ -199,6 +202,11 @@ class ConstantAggregateBuilderBase { add(llvm::ConstantInt::get(intTy, value, isSigned)); } + /// Add a signed pointer using the given pointer authentication schema. + void addSignedPointer(llvm::Constant *Pointer, + const PointerAuthSchema &Schema, GlobalDecl CalleeDecl, + QualType CalleeType); + /// Add a null pointer of a specific type. void addNullPointer(llvm::PointerType *ptrTy) { add(llvm::ConstantPointerNull::get(ptrTy)); diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h index edacd82bf899db..59b9840d02e086 100644 --- a/clang/include/clang/CodeGen/ModuleBuilder.h +++ b/clang/include/clang/CodeGen/ModuleBuilder.h @@ -15,6 +15,7 @@ #include "clang/AST/ASTConsumer.h" #include "clang/Basic/LLVM.h" +#include "llvm/ADT/StringRef.h" namespace llvm { class Constant; @@ -27,6 +28,9 @@ namespace llvm { } } +// Prefix of the name of the artificial inline frame. +inline constexpr llvm::StringRef ClangTrapPrefix = "__clang_trap_msg"; + namespace clang { class CodeGenOptions; class CoverageSourceInfo; diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 084c3ffe69ae86..cc1538372d5f8d 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -747,9 +747,6 @@ class Driver { /// option. void setDriverMode(StringRef DriverModeValue); - /// Set the resource directory, depending on which driver is being used. - void setResourceDirectory(); - /// Parse the \p Args list for LTO options and record the type of LTO /// compilation based on which -f(no-)?lto(=.*)? option occurs last. void setLTOMode(const llvm::opt::ArgList &Args); diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index bbf860aa491e1d..1c2b8cfeef6ce6 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5713,6 +5713,11 @@ def print_supported_extensions : Flag<["-", "--"], "print-supported-extensions"> Visibility<[ClangOption, CC1Option, CLOption]>, HelpText<"Print supported -march extensions (RISC-V, AArch64 and ARM only)">, MarshallingInfoFlag>; +def print_enabled_extensions : Flag<["-", "--"], "print-enabled-extensions">, + Visibility<[ClangOption, CC1Option, CLOption]>, + HelpText<"Print the extensions enabled by the given target and -march/-mcpu options." 
+ " (AArch64 only)">, + MarshallingInfoFlag>; def : Flag<["-"], "mcpu=help">, Alias; def : Flag<["-"], "mtune=help">, Alias; def time : Flag<["-"], "time">, @@ -6314,12 +6319,12 @@ def mno_gather : Flag<["-"], "mno-gather">, Group, def mno_scatter : Flag<["-"], "mno-scatter">, Group, HelpText<"Disable generation of scatter instructions in auto-vectorization(x86 only)">; def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group, - HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">; + HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">; def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group, - HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">; + HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">; // For stability, we only add a feature to -mapxf after it passes the validation of llvm-test-suite && cpu2017 on Intel SDE. -def mapxf : Flag<["-"], "mapxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>; -def mno_apxf : Flag<["-"], "mno-apxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>; +def mapxf : Flag<["-"], "mapxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf"]>; +def mno_apxf : Flag<["-"], "mno-apxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf"]>; def mapx_inline_asm_use_gpr32 : Flag<["-"], "mapx-inline-asm-use-gpr32">, Group, HelpText<"Enable use of GPR32 in inline assembly for APX">; } // let Flags = [TargetSpecific] @@ -8597,6 +8602,8 @@ def : Separate<["-"], "Xmicrosoft-windows-sdk-root">, Alias<_SLASH_winsdkdir>; def : Separate<["-"], "Xmicrosoft-windows-sdk-version">, Alias<_SLASH_winsdkversion>; +def : Separate<["-"], "Xmicrosoft-windows-sys-root">, + Alias<_SLASH_winsysroot>; // Ignored: @@ -8768,3 +8775,11 @@ def spirv : DXCFlag<"spirv">, def fspv_target_env_EQ : Joined<["-"], "fspv-target-env=">, Group, HelpText<"Specify the target environment">, Values<"vulkan1.2, vulkan1.3">; +def no_wasm_opt : Flag<["--"], "no-wasm-opt">, + Group, + HelpText<"Disable the wasm-opt optimizer">, + MarshallingInfoFlag>; +def wasm_opt : Flag<["--"], "wasm-opt">, + Group, + HelpText<"Enable the wasm-opt optimizer (default)">, + MarshallingInfoNegativeFlag>; diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 7d257be10af42c..efc2e450b723fa 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -3095,20 +3095,49 @@ struct FormatStyle { bool JavaScriptWrapImports; // clang-format on - /// Keep empty lines (up to ``MaxEmptyLinesToKeep``) at end of file. - /// \version 17 - bool KeepEmptyLinesAtEOF; - - /// If true, the empty line at the start of blocks is kept. + /// Options regarding which empty lines are kept. + /// + /// For example, the config below will remove empty lines at start of the + /// file, end of the file, and start of blocks. + /// /// \code - /// true: false: - /// if (foo) { vs. if (foo) { - /// bar(); - /// bar(); } - /// } + /// KeepEmptyLines: + /// AtEndOfFile: false + /// AtStartOfBlock: false + /// AtStartOfFile: false /// \endcode + struct KeepEmptyLinesStyle { + /// Keep empty lines at end of file. + bool AtEndOfFile; + /// Keep empty lines at start of a block. + /// \code + /// true: false: + /// if (foo) { vs. if (foo) { + /// bar(); + /// bar(); } + /// } + /// \endcode + bool AtStartOfBlock; + /// Keep empty lines at start of file. 
+ bool AtStartOfFile; + bool operator==(const KeepEmptyLinesStyle &R) const { + return AtEndOfFile == R.AtEndOfFile && + AtStartOfBlock == R.AtStartOfBlock && + AtStartOfFile == R.AtStartOfFile; + } + }; + /// Which empty lines are kept. See ``MaxEmptyLinesToKeep`` for how many + /// consecutive empty lines are kept. + /// \version 19 + KeepEmptyLinesStyle KeepEmptyLines; + + /// This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``. + /// \version 17 + // bool KeepEmptyLinesAtEOF; + + /// This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``. /// \version 3.7 - bool KeepEmptyLinesAtTheStartOfBlocks; + // bool KeepEmptyLinesAtTheStartOfBlocks; /// Indentation logic for lambda bodies. enum LambdaBodyIndentationKind : int8_t { @@ -5033,10 +5062,7 @@ struct FormatStyle { JavaImportGroups == R.JavaImportGroups && JavaScriptQuotes == R.JavaScriptQuotes && JavaScriptWrapImports == R.JavaScriptWrapImports && - KeepEmptyLinesAtEOF == R.KeepEmptyLinesAtEOF && - KeepEmptyLinesAtTheStartOfBlocks == - R.KeepEmptyLinesAtTheStartOfBlocks && - Language == R.Language && + KeepEmptyLines == R.KeepEmptyLines && Language == R.Language && LambdaBodyIndentation == R.LambdaBodyIndentation && LineEnding == R.LineEnding && MacroBlockBegin == R.MacroBlockBegin && MacroBlockEnd == R.MacroBlockEnd && Macros == R.Macros && diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h index ebb8e9e59c6b64..5e5034fe01eb54 100644 --- a/clang/include/clang/Frontend/FrontendOptions.h +++ b/clang/include/clang/Frontend/FrontendOptions.h @@ -306,6 +306,10 @@ class FrontendOptions { LLVM_PREFERRED_TYPE(bool) unsigned PrintSupportedExtensions : 1; + /// Print the extensions enabled for the current target. + LLVM_PREFERRED_TYPE(bool) + unsigned PrintEnabledExtensions : 1; + /// Show the -version text. 
LLVM_PREFERRED_TYPE(bool) unsigned ShowVersion : 1; diff --git a/clang/include/clang/InstallAPI/Visitor.h b/clang/include/clang/InstallAPI/Visitor.h index 9ac948ded3e332..3680ee566ca875 100644 --- a/clang/include/clang/InstallAPI/Visitor.h +++ b/clang/include/clang/InstallAPI/Visitor.h @@ -60,8 +60,8 @@ class InstallAPIVisitor final : public ASTConsumer, std::string getMangledName(const NamedDecl *D) const; std::string getBackendMangledName(llvm::Twine Name) const; std::string getMangledCXXVTableName(const CXXRecordDecl *D) const; - std::string getMangledCXXThunk(const GlobalDecl &D, - const ThunkInfo &Thunk) const; + std::string getMangledCXXThunk(const GlobalDecl &D, const ThunkInfo &Thunk, + bool ElideOverrideInfo) const; std::string getMangledCXXRTTI(const CXXRecordDecl *D) const; std::string getMangledCXXRTTIName(const CXXRecordDecl *D) const; std::string getMangledCtorDtor(const CXXMethodDecl *D, int Type) const; diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 95c0655f9a2145..6880fa4bb0b03a 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3017,11 +3017,12 @@ class Parser : public CodeCompletionHandler { const IdentifierInfo *EnclosingScope = nullptr); void MaybeParseHLSLAnnotations(Declarator &D, - SourceLocation *EndLoc = nullptr) { + SourceLocation *EndLoc = nullptr, + bool CouldBeBitField = false) { assert(getLangOpts().HLSL && "MaybeParseHLSLAnnotations is for HLSL only"); if (Tok.is(tok::colon)) { ParsedAttributes Attrs(AttrFactory); - ParseHLSLAnnotations(Attrs, EndLoc); + ParseHLSLAnnotations(Attrs, EndLoc, CouldBeBitField); D.takeAttributes(Attrs); } } @@ -3029,12 +3030,13 @@ class Parser : public CodeCompletionHandler { void MaybeParseHLSLAnnotations(ParsedAttributes &Attrs, SourceLocation *EndLoc = nullptr) { assert(getLangOpts().HLSL && "MaybeParseHLSLAnnotations is for HLSL only"); - if (getLangOpts().HLSL && Tok.is(tok::colon)) + if (Tok.is(tok::colon)) ParseHLSLAnnotations(Attrs, EndLoc); } void ParseHLSLAnnotations(ParsedAttributes &Attrs, - SourceLocation *EndLoc = nullptr); + SourceLocation *EndLoc = nullptr, + bool CouldBeBitField = false); Decl *ParseHLSLBuffer(SourceLocation &DeclEnd); void MaybeParseMicrosoftAttributes(ParsedAttributes &Attrs) { @@ -3511,6 +3513,19 @@ class Parser : public CodeCompletionHandler { /// metadirective and therefore ends on the closing paren. StmtResult ParseOpenMPDeclarativeOrExecutableDirective( ParsedStmtContext StmtCtx, bool ReadDirectiveWithinMetadirective = false); + + /// Parses executable directive. + /// + /// \param StmtCtx The context in which we're parsing the directive. + /// \param DKind The kind of the executable directive. + /// \param Loc Source location of the beginning of the directive. + /// \param ReadDirectiveWithinMetadirective true if directive is within a + /// metadirective and therefore ends on the closing paren. + StmtResult + ParseOpenMPExecutableDirective(ParsedStmtContext StmtCtx, + OpenMPDirectiveKind DKind, SourceLocation Loc, + bool ReadDirectiveWithinMetadirective); + /// Parses clause of kind \a CKind for directive of a kind \a Kind. /// /// \param DKind Kind of current directive. 
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 22cbd0d90ee432..65d73f6cd44f6e 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -1083,6 +1083,7 @@ enum AttributeArgumentNType { AANT_ArgumentIdentifier, AANT_ArgumentConstantExpr, AANT_ArgumentBuiltinFunction, + AANT_ArgumentNullptrOrBoolIntOrEnumLiteral, }; /// These constants match the enumerated choices of @@ -1101,6 +1102,7 @@ enum AttributeDeclKind { ExpectedFunctionVariableOrClass, ExpectedKernelFunction, ExpectedFunctionWithProtoType, + ExpectedFunctionReturningPointerBoolIntOrEnum, }; inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e43e5f465361d5..ef4fc47567a7c4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -473,6 +473,63 @@ enum class TagUseKind { Friend // Friend declaration: 'friend struct foo;' }; +/// Used with attributes/effects with a boolean condition, e.g. `nonblocking`. +enum class FunctionEffectMode : uint8_t { + None, // effect is not present. + False, // effect(false). + True, // effect(true). + Dependent // effect(expr) where expr is dependent. +}; + +struct FunctionEffectDiff { + enum class Kind { Added, Removed, ConditionMismatch }; + + FunctionEffect::Kind EffectKind; + Kind DiffKind; + FunctionEffectWithCondition Old; // invalid when Added. + FunctionEffectWithCondition New; // invalid when Removed. + + StringRef effectName() const { + if (Old.Effect.kind() != FunctionEffect::Kind::None) + return Old.Effect.name(); + return New.Effect.name(); + } + + /// Describes the result of effects differing between a base class's virtual + /// method and an overriding method in a subclass. + enum class OverrideResult { + NoAction, + Warn, + Merge // Merge missing effect from base to derived. + }; + + /// Return true if adding or removing the effect as part of a type conversion + /// should generate a diagnostic. + bool shouldDiagnoseConversion(QualType SrcType, + const FunctionEffectsRef &SrcFX, + QualType DstType, + const FunctionEffectsRef &DstFX) const; + + /// Return true if adding or removing the effect in a redeclaration should + /// generate a diagnostic. + bool shouldDiagnoseRedeclaration(const FunctionDecl &OldFunction, + const FunctionEffectsRef &OldFX, + const FunctionDecl &NewFunction, + const FunctionEffectsRef &NewFX) const; + + /// Return true if adding or removing the effect in a C++ virtual method + /// override should generate a diagnostic. + OverrideResult shouldDiagnoseMethodOverride( + const CXXMethodDecl &OldMethod, const FunctionEffectsRef &OldFX, + const CXXMethodDecl &NewMethod, const FunctionEffectsRef &NewFX) const; +}; + +struct FunctionEffectDifferences : public SmallVector { + /// Caller should short-circuit by checking for equality first. + FunctionEffectDifferences(const FunctionEffectsRef &Old, + const FunctionEffectsRef &New); +}; + /// Sema - This implements semantic analysis and AST building for C. /// \nosubgrouping class Sema final : public SemaBase { @@ -783,6 +840,28 @@ class Sema final : public SemaBase { /// Warn when implicitly casting 0 to nullptr. void diagnoseZeroToNullptrConversion(CastKind Kind, const Expr *E); + // ----- function effects --- + + /// Warn when implicitly changing function effects. 
+ void diagnoseFunctionEffectConversion(QualType DstType, QualType SrcType, + SourceLocation Loc); + + /// Warn and return true if adding an effect to a set would create a conflict. + bool diagnoseConflictingFunctionEffect(const FunctionEffectsRef &FX, + const FunctionEffectWithCondition &EC, + SourceLocation NewAttrLoc); + + void + diagnoseFunctionEffectMergeConflicts(const FunctionEffectSet::Conflicts &Errs, + SourceLocation NewLoc, + SourceLocation OldLoc); + + /// Try to parse the conditional expression attached to an effect attribute + /// (e.g. 'nonblocking'). (c.f. Sema::ActOnNoexceptSpec). Return an empty + /// optional on error. + std::optional + ActOnEffectExpression(Expr *CondExpr, StringRef AttributeName); + bool makeUnavailableInSystemHeader(SourceLocation loc, UnavailableAttr::ImplicitReason reason); @@ -2035,8 +2114,6 @@ class Sema final : public SemaBase { bool FormatStringHasSArg(const StringLiteral *FExpr); - static bool GetFormatNSStringIdx(const FormatAttr *Format, unsigned &Idx); - void CheckFloatComparison(SourceLocation Loc, Expr *LHS, Expr *RHS, BinaryOperatorKind Opcode); @@ -2149,8 +2226,6 @@ class Sema final : public SemaBase { bool BuiltinVectorMath(CallExpr *TheCall, QualType &Res); bool BuiltinVectorToScalarMath(CallExpr *TheCall); - bool CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); - void checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, const Expr *ThisArg, ArrayRef Args, bool IsMemberFunction, SourceLocation Loc, SourceRange Range, @@ -2180,6 +2255,14 @@ class Sema final : public SemaBase { bool ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum); + void CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, + bool *ICContext = nullptr, + bool IsListInit = false); + + bool BuiltinElementwiseTernaryMath(CallExpr *TheCall, + bool CheckForFloatArgs = true); + bool PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall); + private: void CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr, const ArraySubscriptExpr *ASE = nullptr, @@ -2227,9 +2310,6 @@ class Sema final : public SemaBase { AtomicExpr::AtomicOp Op); bool BuiltinElementwiseMath(CallExpr *TheCall); - bool BuiltinElementwiseTernaryMath(CallExpr *TheCall, - bool CheckForFloatArgs = true); - bool PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall); bool PrepareBuiltinReduceMathOneArgCall(CallExpr *TheCall); bool BuiltinNonDeterministicValue(CallExpr *TheCall); @@ -4487,6 +4567,10 @@ class Sema final : public SemaBase { /// conditions that are needed for the attribute to have an effect. 
void checkIllFormedTrivialABIStruct(CXXRecordDecl &RD); + /// Check that VTable Pointer authentication is only being set on the + /// first instantiation of the vtable + void checkIncorrectVTablePointerAuthenticationAttribute(CXXRecordDecl &RD); + void ActOnFinishCXXMemberSpecification(Scope *S, SourceLocation RLoc, Decl *TagDecl, SourceLocation LBrac, SourceLocation RBrac, @@ -4612,7 +4696,7 @@ class Sema final : public SemaBase { std::string getAmbiguousPathsDisplayString(CXXBasePaths &Paths); - bool CheckOverridingFunctionAttributes(const CXXMethodDecl *New, + bool CheckOverridingFunctionAttributes(CXXMethodDecl *New, const CXXMethodDecl *Old); /// CheckOverridingFunctionReturnType - Checks whether the return types are diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 0e41a72e444ef4..4d6958a1be3e5b 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -58,6 +58,8 @@ class SemaHLSL : public SemaBase { void handleShaderAttr(Decl *D, const ParsedAttr &AL); void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL); void handleParamModifierAttr(Decl *D, const ParsedAttr &AL); + + bool CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); }; } // namespace clang diff --git a/clang/include/clang/Sema/SemaObjC.h b/clang/include/clang/Sema/SemaObjC.h index bb8887691ce5d3..07c3c1a06be160 100644 --- a/clang/include/clang/Sema/SemaObjC.h +++ b/clang/include/clang/Sema/SemaObjC.h @@ -158,6 +158,27 @@ class SemaObjC : public SemaBase { IdentifierInfo *getNSErrorIdent(); + bool GetFormatNSStringIdx(const FormatAttr *Format, unsigned &Idx); + + /// Diagnose use of %s directive in an NSString which is being passed + /// as formatting string to formatting method. + void DiagnoseCStringFormatDirectiveInCFAPI(const NamedDecl *FDecl, + Expr **Args, unsigned NumArgs); + + bool isSignedCharBool(QualType Ty); + + void adornBoolConversionDiagWithTernaryFixit( + Expr *SourceExpr, const Sema::SemaDiagnosticBuilder &Builder); + + /// Check an Objective-C dictionary literal being converted to the given + /// target type. + void checkDictionaryLiteral(QualType TargetType, + ObjCDictionaryLiteral *DictionaryLiteral); + + /// Check an Objective-C array literal being converted to the given + /// target type. + void checkArrayLiteral(QualType TargetType, ObjCArrayLiteral *ArrayLiteral); + private: IdentifierInfo *Ident_NSError = nullptr; diff --git a/clang/include/clang/Sema/SemaOpenCL.h b/clang/include/clang/Sema/SemaOpenCL.h index 0d80c4b4c0b561..7d6b4b0dec09c2 100644 --- a/clang/include/clang/Sema/SemaOpenCL.h +++ b/clang/include/clang/Sema/SemaOpenCL.h @@ -28,6 +28,78 @@ class SemaOpenCL : public SemaBase { // Handles intel_reqd_sub_group_size. void handleSubGroupSize(Decl *D, const ParsedAttr &AL); + + // Performs semantic analysis for the read/write_pipe call. + // \param S Reference to the semantic analyzer. + // \param Call A pointer to the builtin call. + // \return True if a semantic error has been found, false otherwise. + bool checkBuiltinRWPipe(CallExpr *Call); + + // Performs a semantic analysis on the {work_group_/sub_group_ + // /_}reserve_{read/write}_pipe + // \param S Reference to the semantic analyzer. + // \param Call The call to the builtin function to be analyzed. + // \return True if a semantic error was found, false otherwise. 
+ bool checkBuiltinReserveRWPipe(CallExpr *Call); + + bool checkSubgroupExt(CallExpr *Call); + + // Performs a semantic analysis on {work_group_/sub_group_ + // /_}commit_{read/write}_pipe + // \param S Reference to the semantic analyzer. + // \param Call The call to the builtin function to be analyzed. + // \return True if a semantic error was found, false otherwise. + bool checkBuiltinCommitRWPipe(CallExpr *Call); + + // Performs a semantic analysis on the call to built-in Pipe + // Query Functions. + // \param S Reference to the semantic analyzer. + // \param Call The call to the builtin function to be analyzed. + // \return True if a semantic error was found, false otherwise. + bool checkBuiltinPipePackets(CallExpr *Call); + + // OpenCL v2.0 s6.13.9 - Address space qualifier functions. + // Performs semantic analysis for the to_global/local/private call. + // \param S Reference to the semantic analyzer. + // \param BuiltinID ID of the builtin function. + // \param Call A pointer to the builtin call. + // \return True if a semantic error has been found, false otherwise. + bool checkBuiltinToAddr(unsigned BuiltinID, CallExpr *Call); + + /// OpenCL C v2.0, s6.13.17 - Enqueue kernel function contains four different + /// overload formats specified in Table 6.13.17.1. + /// int enqueue_kernel(queue_t queue, + /// kernel_enqueue_flags_t flags, + /// const ndrange_t ndrange, + /// void (^block)(void)) + /// int enqueue_kernel(queue_t queue, + /// kernel_enqueue_flags_t flags, + /// const ndrange_t ndrange, + /// uint num_events_in_wait_list, + /// clk_event_t *event_wait_list, + /// clk_event_t *event_ret, + /// void (^block)(void)) + /// int enqueue_kernel(queue_t queue, + /// kernel_enqueue_flags_t flags, + /// const ndrange_t ndrange, + /// void (^block)(local void*, ...), + /// uint size0, ...) + /// int enqueue_kernel(queue_t queue, + /// kernel_enqueue_flags_t flags, + /// const ndrange_t ndrange, + /// uint num_events_in_wait_list, + /// clk_event_t *event_wait_list, + /// clk_event_t *event_ret, + /// void (^block)(local void*, ...), + /// uint size0, ...) + bool checkBuiltinEnqueueKernel(CallExpr *TheCall); + + /// OpenCL C v2.0, s6.13.17.6 - Check the argument to the + /// get_kernel_work_group_size + /// and get_kernel_preferred_work_group_size_multiple builtin functions. + bool checkBuiltinKernelWorkGroupSize(CallExpr *TheCall); + + bool checkBuiltinNDRangeAndBlock(CallExpr *TheCall); }; } // namespace clang diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 429c334a0b24b2..6e224a4e098ad2 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -686,22 +686,11 @@ def MoveChecker: Checker<"Move">, CheckerOptions<[ CmdLineOption ]>, diff --git a/clang/include/clang/Tooling/Transformer/RangeSelector.h b/clang/include/clang/Tooling/Transformer/RangeSelector.h index 1e288043f0a8e5..462a9da8f10ebb 100644 --- a/clang/include/clang/Tooling/Transformer/RangeSelector.h +++ b/clang/include/clang/Tooling/Transformer/RangeSelector.h @@ -86,6 +86,11 @@ RangeSelector name(std::string ID); // source between the call's parentheses). RangeSelector callArgs(std::string ID); +// Given a \c CXXConstructExpr (bound to \p ID), selects the +// arguments' source text. Depending on the syntactic form of the construct, +// this is the range between parentheses or braces. 
+RangeSelector constructExprArgs(std::string ID); + // Given a \c CompoundStmt (bound to \p ID), selects the source of the // statements (all source between the braces). RangeSelector statements(std::string ID); diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index acd960c9c932ac..00ecd47d35b623 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -37,19 +37,20 @@ module Clang_Basic { umbrella "clang/Basic" textual header "clang/Basic/AArch64SVEACLETypes.def" + textual header "clang/Basic/AMDGPUTypes.def" + textual header "clang/Basic/BuiltinHeaders.def" textual header "clang/Basic/BuiltinsAArch64.def" - textual header "clang/Basic/BuiltinsAMDGPU.def" textual header "clang/Basic/BuiltinsAArch64NeonSVEBridge.def" textual header "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def" + textual header "clang/Basic/BuiltinsAMDGPU.def" textual header "clang/Basic/BuiltinsARM.def" - textual header "clang/Basic/BuiltinHeaders.def" textual header "clang/Basic/BuiltinsHexagon.def" textual header "clang/Basic/BuiltinsHexagonDep.def" textual header "clang/Basic/BuiltinsHexagonMapCustomDep.def" textual header "clang/Basic/BuiltinsLoongArch.def" textual header "clang/Basic/BuiltinsLoongArchBase.def" - textual header "clang/Basic/BuiltinsLoongArchLSX.def" textual header "clang/Basic/BuiltinsLoongArchLASX.def" + textual header "clang/Basic/BuiltinsLoongArchLSX.def" textual header "clang/Basic/BuiltinsMips.def" textual header "clang/Basic/BuiltinsNEON.def" textual header "clang/Basic/BuiltinsNVPTX.def" @@ -67,13 +68,14 @@ module Clang_Basic { textual header "clang/Basic/CodeGenOptions.def" textual header "clang/Basic/DebugOptions.def" textual header "clang/Basic/DiagnosticOptions.def" - textual header "clang/Basic/Features.def" textual header "clang/Basic/FPOptions.def" - textual header "clang/Basic/MSP430Target.def" + textual header "clang/Basic/Features.def" textual header "clang/Basic/LangOptions.def" + textual header "clang/Basic/MSP430Target.def" + textual header "clang/Basic/OpenACCClauses.def" + textual header "clang/Basic/OpenCLExtensionTypes.def" textual header "clang/Basic/OpenCLExtensions.def" textual header "clang/Basic/OpenCLImageTypes.def" - textual header "clang/Basic/OpenCLExtensionTypes.def" textual header "clang/Basic/OpenMPKinds.def" textual header "clang/Basic/OperatorKinds.def" textual header "clang/Basic/PPCTypes.def" @@ -81,8 +83,8 @@ module Clang_Basic { textual header "clang/Basic/Sanitizers.def" textual header "clang/Basic/TargetCXXABI.def" textual header "clang/Basic/TargetOSMacros.def" - textual header "clang/Basic/TransformTypeTraits.def" textual header "clang/Basic/TokenKinds.def" + textual header "clang/Basic/TransformTypeTraits.def" textual header "clang/Basic/WebAssemblyReferenceTypes.def" module * { export * } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index fca200988fea19..84deaf5429df75 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -86,6 +86,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MD5.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SipHash.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/Triple.h" @@ -1110,6 +1111,31 @@ void ASTContext::setCurrentNamedModule(Module *M) { CurrentCXXNamedModule = M; } +bool ASTContext::isInSameModule(const Module *M1, const Module *M2) { + if (!M1 != !M2) + return false; + + /// Get the representative module 
for M. The representative module is the + /// first module unit seen for a given primary module name, so module units + /// that share the same representative module belong to the same module. + /// + /// Caching the representative module helps avoid expensive string operations. + auto GetRepresentativeModule = [this](const Module *M) { + auto Iter = SameModuleLookupSet.find(M); + if (Iter != SameModuleLookupSet.end()) + return Iter->second; + + const Module *RepresentativeModule = + PrimaryModuleNameMap.try_emplace(M->getPrimaryModuleInterfaceName(), M) + .first->second; + SameModuleLookupSet[M] = RepresentativeModule; + return RepresentativeModule; + }; + + assert(M1 && "Shouldn't call `isInSameModule` if both M1 and M2 are none."); + return GetRepresentativeModule(M1) == GetRepresentativeModule(M2); +} + ExternCContextDecl *ASTContext::getExternCContextDecl() const { if (!ExternCContext) ExternCContext = ExternCContextDecl::Create(*this, getTranslationUnitDecl()); @@ -3103,6 +3129,17 @@ QualType ASTContext::removeAddrSpaceQualType(QualType T) const { return QualType(TypeNode, Quals.getFastQualifiers()); } +uint16_t +ASTContext::getPointerAuthVTablePointerDiscriminator(const CXXRecordDecl *RD) { + assert(RD->isPolymorphic() && + "Attempted to get vtable pointer discriminator on a monomorphic type"); + std::unique_ptr MC(createMangleContext()); + SmallString<256> Str; + llvm::raw_svector_ostream Out(Str); + MC->mangleCXXVTable(RD, Out); + return llvm::getPointerAuthStableSipHash(Str); +} + QualType ASTContext::getObjCGCQualType(QualType T, Qualifiers::GC GCAttr) const { QualType CanT = getCanonicalType(T); @@ -4576,11 +4613,13 @@ QualType ASTContext::getFunctionTypeInternal( size_t Size = FunctionProtoType::totalSizeToAlloc< QualType, SourceLocation, FunctionType::FunctionTypeExtraBitfields, FunctionType::FunctionTypeArmAttributes, FunctionType::ExceptionType, - Expr *, FunctionDecl *, FunctionProtoType::ExtParameterInfo, Qualifiers>( + Expr *, FunctionDecl *, FunctionProtoType::ExtParameterInfo, + FunctionEffect, EffectConditionExpr, Qualifiers>( NumArgs, EPI.Variadic, EPI.requiresFunctionProtoTypeExtraBitfields(), EPI.requiresFunctionProtoTypeArmAttributes(), ESH.NumExceptionType, ESH.NumExprPtr, ESH.NumFunctionDeclPtr, - EPI.ExtParameterInfos ? NumArgs : 0, + EPI.ExtParameterInfos ? NumArgs : 0, EPI.FunctionEffects.size(), + EPI.FunctionEffects.conditions().size(), EPI.TypeQuals.hasNonFastQualifiers() ? 1 : 0); auto *FTP = (FunctionProtoType *)Allocate(Size, alignof(FunctionProtoType)); @@ -10525,6 +10564,8 @@ QualType ASTContext::mergeFunctionTypes(QualType lhs, QualType rhs, FunctionType::ExtInfo einfo = lbaseInfo.withNoReturn(NoReturn); + std::optional MergedFX; + if (lproto && rproto) { // two C99 style function prototypes assert((AllowCXX || (!lproto->hasExceptionSpec() && !rproto->hasExceptionSpec())) && @@ -10540,6 +10581,25 @@ QualType ASTContext::mergeFunctionTypes(QualType lhs, QualType rhs, if (lproto->getMethodQuals() != rproto->getMethodQuals()) return {}; + // Function effects are handled similarly to noreturn, see above. + FunctionEffectsRef LHSFX = lproto->getFunctionEffects(); + FunctionEffectsRef RHSFX = rproto->getFunctionEffects(); + if (LHSFX != RHSFX) { + if (IsConditionalOperator) + MergedFX = FunctionEffectSet::getIntersection(LHSFX, RHSFX); + else { + FunctionEffectSet::Conflicts Errs; + MergedFX = FunctionEffectSet::getUnion(LHSFX, RHSFX, Errs); + // Here we're discarding a possible error due to conflicts in the effect + // sets.
But we're not in a context where we can report it. The + // operation does however guarantee maintenance of invariants. + } + if (*MergedFX != LHSFX) + allLTypes = false; + if (*MergedFX != RHSFX) + allRTypes = false; + } + SmallVector newParamInfos; bool canUseLeft, canUseRight; if (!mergeExtParameterInfo(lproto, rproto, canUseLeft, canUseRight, @@ -10583,6 +10643,8 @@ QualType ASTContext::mergeFunctionTypes(QualType lhs, QualType rhs, EPI.ExtInfo = einfo; EPI.ExtParameterInfos = newParamInfos.empty() ? nullptr : newParamInfos.data(); + if (MergedFX) + EPI.FunctionEffects = *MergedFX; return getFunctionType(retType, types, EPI); } @@ -10620,6 +10682,8 @@ QualType ASTContext::mergeFunctionTypes(QualType lhs, QualType rhs, FunctionProtoType::ExtProtoInfo EPI = proto->getExtProtoInfo(); EPI.ExtInfo = einfo; + if (MergedFX) + EPI.FunctionEffects = *MergedFX; return getFunctionType(retType, proto->getParamTypes(), EPI); } @@ -13842,3 +13906,74 @@ StringRef ASTContext::getCUIDHash() const { CUIDHash = llvm::utohexstr(llvm::MD5Hash(LangOpts.CUID), /*LowerCase=*/true); return CUIDHash; } + +const CXXRecordDecl * +ASTContext::baseForVTableAuthentication(const CXXRecordDecl *ThisClass) { + assert(ThisClass); + assert(ThisClass->isPolymorphic()); + const CXXRecordDecl *PrimaryBase = ThisClass; + while (1) { + assert(PrimaryBase); + assert(PrimaryBase->isPolymorphic()); + auto &Layout = getASTRecordLayout(PrimaryBase); + auto Base = Layout.getPrimaryBase(); + if (!Base || Base == PrimaryBase || !Base->isPolymorphic()) + break; + PrimaryBase = Base; + } + return PrimaryBase; +} + +bool ASTContext::useAbbreviatedThunkName(GlobalDecl VirtualMethodDecl, + StringRef MangledName) { + auto *Method = cast(VirtualMethodDecl.getDecl()); + assert(Method->isVirtual()); + bool DefaultIncludesPointerAuth = + LangOpts.PointerAuthCalls || LangOpts.PointerAuthIntrinsics; + + if (!DefaultIncludesPointerAuth) + return true; + + auto Existing = ThunksToBeAbbreviated.find(VirtualMethodDecl); + if (Existing != ThunksToBeAbbreviated.end()) + return Existing->second.contains(MangledName.str()); + + std::unique_ptr Mangler(createMangleContext()); + llvm::StringMap> Thunks; + auto VtableContext = getVTableContext(); + if (const auto *ThunkInfos = VtableContext->getThunkInfo(VirtualMethodDecl)) { + auto *Destructor = dyn_cast(Method); + for (const auto &Thunk : *ThunkInfos) { + SmallString<256> ElidedName; + llvm::raw_svector_ostream ElidedNameStream(ElidedName); + if (Destructor) + Mangler->mangleCXXDtorThunk(Destructor, VirtualMethodDecl.getDtorType(), + Thunk, /* elideOverrideInfo */ true, + ElidedNameStream); + else + Mangler->mangleThunk(Method, Thunk, /* elideOverrideInfo */ true, + ElidedNameStream); + SmallString<256> MangledName; + llvm::raw_svector_ostream mangledNameStream(MangledName); + if (Destructor) + Mangler->mangleCXXDtorThunk(Destructor, VirtualMethodDecl.getDtorType(), + Thunk, /* elideOverrideInfo */ false, + mangledNameStream); + else + Mangler->mangleThunk(Method, Thunk, /* elideOverrideInfo */ false, + mangledNameStream); + + if (Thunks.find(ElidedName) == Thunks.end()) + Thunks[ElidedName] = {}; + Thunks[ElidedName].push_back(std::string(MangledName)); + } + } + llvm::StringSet<> SimplifiedThunkNames; + for (auto &ThunkList : Thunks) { + llvm::sort(ThunkList.second); + SimplifiedThunkNames.insert(ThunkList.second[0]); + } + bool Result = SimplifiedThunkNames.contains(MangledName); + ThunksToBeAbbreviated[VirtualMethodDecl] = std::move(SimplifiedThunkNames); + return Result; +} diff --git 
a/clang/lib/AST/ASTDumper.cpp b/clang/lib/AST/ASTDumper.cpp index f0603880c32dde..864d0393f9a78e 100644 --- a/clang/lib/AST/ASTDumper.cpp +++ b/clang/lib/AST/ASTDumper.cpp @@ -342,8 +342,7 @@ LLVM_DUMP_METHOD void APValue::dump() const { LLVM_DUMP_METHOD void APValue::dump(raw_ostream &OS, const ASTContext &Context) const { - ASTDumper Dumper(llvm::errs(), Context, - Context.getDiagnostics().getShowColors()); + ASTDumper Dumper(OS, Context, Context.getDiagnostics().getShowColors()); Dumper.Visit(*this, /*Ty=*/Context.getPointerType(Context.CharTy)); } diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index 0328666d59b1fc..ceaad8d3c5a865 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -65,8 +65,7 @@ add_clang_library(clangAST FormatString.cpp InheritViz.cpp Interp/ByteCodeEmitter.cpp - Interp/ByteCodeExprGen.cpp - Interp/ByteCodeStmtGen.cpp + Interp/Compiler.cpp Interp/Context.cpp Interp/Descriptor.cpp Interp/Disasm.cpp diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 9d0a835a12c458..16ed6d88d1cb1e 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -1614,7 +1614,7 @@ LinkageInfo LinkageComputer::getDeclLinkageAndVisibility(const NamedDecl *D) { : CK); } -Module *Decl::getOwningModuleForLinkage(bool IgnoreLinkage) const { +Module *Decl::getOwningModuleForLinkage() const { if (isa(this)) // Namespaces never have module linkage. It is the entities within them // that [may] do. @@ -1637,24 +1637,9 @@ Module *Decl::getOwningModuleForLinkage(bool IgnoreLinkage) const { case Module::ModuleHeaderUnit: case Module::ExplicitGlobalModuleFragment: - case Module::ImplicitGlobalModuleFragment: { - // External linkage declarations in the global module have no owning module - // for linkage purposes. But internal linkage declarations in the global - // module fragment of a particular module are owned by that module for - // linkage purposes. - // FIXME: p1815 removes the need for this distinction -- there are no - // internal linkage declarations that need to be referred to from outside - // this TU. - if (IgnoreLinkage) - return nullptr; - bool InternalLinkage; - if (auto *ND = dyn_cast(this)) - InternalLinkage = !ND->hasExternalFormalLinkage(); - else - InternalLinkage = isInAnonymousNamespace(); - return InternalLinkage ? M->Kind == Module::ModuleHeaderUnit ? M : M->Parent - : nullptr; - } + case Module::ImplicitGlobalModuleFragment: + // The global module shouldn't change the linkage. 
+ return nullptr; case Module::PrivateModuleFragment: // The private module fragment is part of its containing module for linkage diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp index 0cf4e64f83b8df..26773a69ab9acf 100644 --- a/clang/lib/AST/DeclPrinter.cpp +++ b/clang/lib/AST/DeclPrinter.cpp @@ -1187,6 +1187,13 @@ void DeclPrinter::printTemplateParameters(const TemplateParameterList *Params, } Out << '>'; + + if (const Expr *RequiresClause = Params->getRequiresClause()) { + Out << " requires "; + RequiresClause->printPretty(Out, nullptr, Policy, Indentation, "\n", + &Context); + } + if (!OmitTemplateKW) Out << ' '; } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index fe4b9a569ab874..374a3acf7aa26f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1885,7 +1885,8 @@ static bool EvaluateAtomic(const Expr *E, const LValue *This, APValue &Result, EvalInfo &Info); static bool EvaluateAsRValue(EvalInfo &Info, const Expr *E, APValue &Result); static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, - EvalInfo &Info); + EvalInfo &Info, + std::string *StringResult = nullptr); /// Evaluate an integer or fixed point expression into an APResult. static bool EvaluateFixedPointOrInteger(const Expr *E, APFixedPoint &Result, @@ -17009,7 +17010,7 @@ bool Expr::tryEvaluateObjectSize(uint64_t &Result, ASTContext &Ctx, } static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, - EvalInfo &Info) { + EvalInfo &Info, std::string *StringResult) { if (!E->getType()->hasPointerRepresentation() || !E->isPRValue()) return false; @@ -17036,6 +17037,8 @@ static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, Str = Str.substr(0, Pos); Result = Str.size(); + if (StringResult) + *StringResult = Str; return true; } @@ -17051,12 +17054,24 @@ static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, if (!Char.getInt()) { Result = Strlen; return true; - } + } else if (StringResult) + StringResult->push_back(Char.getInt().getExtValue()); if (!HandleLValueArrayAdjustment(Info, E, String, CharTy, 1)) return false; } } +std::optional Expr::tryEvaluateString(ASTContext &Ctx) const { + Expr::EvalStatus Status; + EvalInfo Info(Ctx, Status, EvalInfo::EM_ConstantFold); + uint64_t Result; + std::string StringResult; + + if (EvaluateBuiltinStrLen(this, Result, Info, &StringResult)) + return StringResult; + return {}; +} + bool Expr::EvaluateCharRangeAsString(std::string &Result, const Expr *SizeExpression, const Expr *PtrExpression, ASTContext &Ctx, diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.h b/clang/lib/AST/Interp/ByteCodeEmitter.h index d797a0ab4a1c9d..9a329e969f339c 100644 --- a/clang/lib/AST/Interp/ByteCodeEmitter.h +++ b/clang/lib/AST/Interp/ByteCodeEmitter.h @@ -54,6 +54,9 @@ class ByteCodeEmitter { bool jump(const LabelTy &Label); bool fallthrough(const LabelTy &Label); + /// We're always emitting bytecode. + bool isActive() const { return true; } + /// Callback for local registration. Local createLocal(Descriptor *D); diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp deleted file mode 100644 index 0618ec1aa8f584..00000000000000 --- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp +++ /dev/null @@ -1,734 +0,0 @@ -//===--- ByteCodeStmtGen.cpp - Code generator for expressions ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ByteCodeStmtGen.h" -#include "ByteCodeEmitter.h" -#include "Context.h" -#include "Function.h" -#include "PrimType.h" - -using namespace clang; -using namespace clang::interp; - -namespace clang { -namespace interp { - -/// Scope managing label targets. -template class LabelScope { -public: - virtual ~LabelScope() { } - -protected: - LabelScope(ByteCodeStmtGen *Ctx) : Ctx(Ctx) {} - /// ByteCodeStmtGen instance. - ByteCodeStmtGen *Ctx; -}; - -/// Sets the context for break/continue statements. -template class LoopScope final : public LabelScope { -public: - using LabelTy = typename ByteCodeStmtGen::LabelTy; - using OptLabelTy = typename ByteCodeStmtGen::OptLabelTy; - - LoopScope(ByteCodeStmtGen *Ctx, LabelTy BreakLabel, - LabelTy ContinueLabel) - : LabelScope(Ctx), OldBreakLabel(Ctx->BreakLabel), - OldContinueLabel(Ctx->ContinueLabel) { - this->Ctx->BreakLabel = BreakLabel; - this->Ctx->ContinueLabel = ContinueLabel; - } - - ~LoopScope() { - this->Ctx->BreakLabel = OldBreakLabel; - this->Ctx->ContinueLabel = OldContinueLabel; - } - -private: - OptLabelTy OldBreakLabel; - OptLabelTy OldContinueLabel; -}; - -// Sets the context for a switch scope, mapping labels. -template class SwitchScope final : public LabelScope { -public: - using LabelTy = typename ByteCodeStmtGen::LabelTy; - using OptLabelTy = typename ByteCodeStmtGen::OptLabelTy; - using CaseMap = typename ByteCodeStmtGen::CaseMap; - - SwitchScope(ByteCodeStmtGen *Ctx, CaseMap &&CaseLabels, - LabelTy BreakLabel, OptLabelTy DefaultLabel) - : LabelScope(Ctx), OldBreakLabel(Ctx->BreakLabel), - OldDefaultLabel(this->Ctx->DefaultLabel), - OldCaseLabels(std::move(this->Ctx->CaseLabels)) { - this->Ctx->BreakLabel = BreakLabel; - this->Ctx->DefaultLabel = DefaultLabel; - this->Ctx->CaseLabels = std::move(CaseLabels); - } - - ~SwitchScope() { - this->Ctx->BreakLabel = OldBreakLabel; - this->Ctx->DefaultLabel = OldDefaultLabel; - this->Ctx->CaseLabels = std::move(OldCaseLabels); - } - -private: - OptLabelTy OldBreakLabel; - OptLabelTy OldDefaultLabel; - CaseMap OldCaseLabels; -}; - -} // namespace interp -} // namespace clang - -template -bool ByteCodeStmtGen::emitLambdaStaticInvokerBody( - const CXXMethodDecl *MD) { - assert(MD->isLambdaStaticInvoker()); - assert(MD->hasBody()); - assert(cast(MD->getBody())->body_empty()); - - const CXXRecordDecl *ClosureClass = MD->getParent(); - const CXXMethodDecl *LambdaCallOp = ClosureClass->getLambdaCallOperator(); - assert(ClosureClass->captures_begin() == ClosureClass->captures_end()); - const Function *Func = this->getFunction(LambdaCallOp); - if (!Func) - return false; - assert(Func->hasThisPointer()); - assert(Func->getNumParams() == (MD->getNumParams() + 1 + Func->hasRVO())); - - if (Func->hasRVO()) { - if (!this->emitRVOPtr(MD)) - return false; - } - - // The lambda call operator needs an instance pointer, but we don't have - // one here, and we don't need one either because the lambda cannot have - // any captures, as verified above. Emit a null pointer. This is then - // special-cased when interpreting to not emit any misleading diagnostics. - if (!this->emitNullPtr(nullptr, MD)) - return false; - - // Forward all arguments from the static invoker to the lambda call operator. 
- for (const ParmVarDecl *PVD : MD->parameters()) { - auto It = this->Params.find(PVD); - assert(It != this->Params.end()); - - // We do the lvalue-to-rvalue conversion manually here, so no need - // to care about references. - PrimType ParamType = this->classify(PVD->getType()).value_or(PT_Ptr); - if (!this->emitGetParam(ParamType, It->second.Offset, MD)) - return false; - } - - if (!this->emitCall(Func, 0, LambdaCallOp)) - return false; - - this->emitCleanup(); - if (ReturnType) - return this->emitRet(*ReturnType, MD); - - // Nothing to do, since we emitted the RVO pointer above. - return this->emitRetVoid(MD); -} - -template -bool ByteCodeStmtGen::visitFunc(const FunctionDecl *F) { - // Classify the return type. - ReturnType = this->classify(F->getReturnType()); - - auto emitFieldInitializer = [&](const Record::Field *F, unsigned FieldOffset, - const Expr *InitExpr) -> bool { - // We don't know what to do with these, so just return false. - if (InitExpr->getType().isNull()) - return false; - - if (std::optional T = this->classify(InitExpr)) { - if (!this->visit(InitExpr)) - return false; - - if (F->isBitField()) - return this->emitInitThisBitField(*T, F, FieldOffset, InitExpr); - return this->emitInitThisField(*T, FieldOffset, InitExpr); - } - - // Non-primitive case. Get a pointer to the field-to-initialize - // on the stack and call visitInitialzer() for it. - InitLinkScope FieldScope(this, InitLink::Field(F->Offset)); - if (!this->emitGetPtrThisField(FieldOffset, InitExpr)) - return false; - - if (!this->visitInitializer(InitExpr)) - return false; - - return this->emitPopPtr(InitExpr); - }; - - // Emit custom code if this is a lambda static invoker. - if (const auto *MD = dyn_cast(F); - MD && MD->isLambdaStaticInvoker()) - return this->emitLambdaStaticInvokerBody(MD); - - // Constructor. Set up field initializers. - if (const auto *Ctor = dyn_cast(F)) { - const RecordDecl *RD = Ctor->getParent(); - const Record *R = this->getRecord(RD); - if (!R) - return false; - - InitLinkScope InitScope(this, InitLink::This()); - for (const auto *Init : Ctor->inits()) { - // Scope needed for the initializers. - BlockScope Scope(this); - - const Expr *InitExpr = Init->getInit(); - if (const FieldDecl *Member = Init->getMember()) { - const Record::Field *F = R->getField(Member); - - if (!emitFieldInitializer(F, F->Offset, InitExpr)) - return false; - } else if (const Type *Base = Init->getBaseClass()) { - const auto *BaseDecl = Base->getAsCXXRecordDecl(); - assert(BaseDecl); - - if (Init->isBaseVirtual()) { - assert(R->getVirtualBase(BaseDecl)); - if (!this->emitGetPtrThisVirtBase(BaseDecl, InitExpr)) - return false; - - } else { - // Base class initializer. - // Get This Base and call initializer on it. 
- const Record::Base *B = R->getBase(BaseDecl); - assert(B); - if (!this->emitGetPtrThisBase(B->Offset, InitExpr)) - return false; - } - - if (!this->visitInitializer(InitExpr)) - return false; - if (!this->emitFinishInitPop(InitExpr)) - return false; - } else if (const IndirectFieldDecl *IFD = Init->getIndirectMember()) { - assert(IFD->getChainingSize() >= 2); - - unsigned NestedFieldOffset = 0; - const Record::Field *NestedField = nullptr; - for (const NamedDecl *ND : IFD->chain()) { - const auto *FD = cast(ND); - const Record *FieldRecord = - this->P.getOrCreateRecord(FD->getParent()); - assert(FieldRecord); - - NestedField = FieldRecord->getField(FD); - assert(NestedField); - - NestedFieldOffset += NestedField->Offset; - } - assert(NestedField); - - if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr)) - return false; - } else { - assert(Init->isDelegatingInitializer()); - if (!this->emitThis(InitExpr)) - return false; - if (!this->visitInitializer(Init->getInit())) - return false; - if (!this->emitPopPtr(InitExpr)) - return false; - } - } - } - - if (const auto *Body = F->getBody()) - if (!visitStmt(Body)) - return false; - - // Emit a guard return to protect against a code path missing one. - if (F->getReturnType()->isVoidType()) - return this->emitRetVoid(SourceInfo{}); - else - return this->emitNoRet(SourceInfo{}); -} - -template -bool ByteCodeStmtGen::visitStmt(const Stmt *S) { - switch (S->getStmtClass()) { - case Stmt::CompoundStmtClass: - return visitCompoundStmt(cast(S)); - case Stmt::DeclStmtClass: - return visitDeclStmt(cast(S)); - case Stmt::ReturnStmtClass: - return visitReturnStmt(cast(S)); - case Stmt::IfStmtClass: - return visitIfStmt(cast(S)); - case Stmt::WhileStmtClass: - return visitWhileStmt(cast(S)); - case Stmt::DoStmtClass: - return visitDoStmt(cast(S)); - case Stmt::ForStmtClass: - return visitForStmt(cast(S)); - case Stmt::CXXForRangeStmtClass: - return visitCXXForRangeStmt(cast(S)); - case Stmt::BreakStmtClass: - return visitBreakStmt(cast(S)); - case Stmt::ContinueStmtClass: - return visitContinueStmt(cast(S)); - case Stmt::SwitchStmtClass: - return visitSwitchStmt(cast(S)); - case Stmt::CaseStmtClass: - return visitCaseStmt(cast(S)); - case Stmt::DefaultStmtClass: - return visitDefaultStmt(cast(S)); - case Stmt::AttributedStmtClass: - return visitAttributedStmt(cast(S)); - case Stmt::CXXTryStmtClass: - return visitCXXTryStmt(cast(S)); - case Stmt::NullStmtClass: - return true; - // Always invalid statements. - case Stmt::GCCAsmStmtClass: - case Stmt::MSAsmStmtClass: - case Stmt::GotoStmtClass: - return this->emitInvalid(S); - case Stmt::LabelStmtClass: - return this->visitStmt(cast(S)->getSubStmt()); - default: { - if (auto *Exp = dyn_cast(S)) - return this->discard(Exp); - return false; - } - } -} - -/// Visits the given statment without creating a variable -/// scope for it in case it is a compound statement. 
-template -bool ByteCodeStmtGen::visitLoopBody(const Stmt *S) { - if (isa(S)) - return true; - - if (const auto *CS = dyn_cast(S)) { - for (auto *InnerStmt : CS->body()) - if (!visitStmt(InnerStmt)) - return false; - return true; - } - - return this->visitStmt(S); -} - -template -bool ByteCodeStmtGen::visitCompoundStmt( - const CompoundStmt *CompoundStmt) { - BlockScope Scope(this); - for (auto *InnerStmt : CompoundStmt->body()) - if (!visitStmt(InnerStmt)) - return false; - return true; -} - -template -bool ByteCodeStmtGen::visitDeclStmt(const DeclStmt *DS) { - for (auto *D : DS->decls()) { - if (isa(D)) - continue; - - const auto *VD = dyn_cast(D); - if (!VD) - return false; - if (!this->visitVarDecl(VD)) - return false; - } - - return true; -} - -template -bool ByteCodeStmtGen::visitReturnStmt(const ReturnStmt *RS) { - if (const Expr *RE = RS->getRetValue()) { - ExprScope RetScope(this); - if (ReturnType) { - // Primitive types are simply returned. - if (!this->visit(RE)) - return false; - this->emitCleanup(); - return this->emitRet(*ReturnType, RS); - } else if (RE->getType()->isVoidType()) { - if (!this->visit(RE)) - return false; - } else { - // RVO - construct the value in the return location. - if (!this->emitRVOPtr(RE)) - return false; - if (!this->visitInitializer(RE)) - return false; - if (!this->emitPopPtr(RE)) - return false; - - this->emitCleanup(); - return this->emitRetVoid(RS); - } - } - - // Void return. - this->emitCleanup(); - return this->emitRetVoid(RS); -} - -template -bool ByteCodeStmtGen::visitIfStmt(const IfStmt *IS) { - BlockScope IfScope(this); - - if (IS->isNonNegatedConsteval()) - return visitStmt(IS->getThen()); - if (IS->isNegatedConsteval()) - return IS->getElse() ? visitStmt(IS->getElse()) : true; - - if (auto *CondInit = IS->getInit()) - if (!visitStmt(CondInit)) - return false; - - if (const DeclStmt *CondDecl = IS->getConditionVariableDeclStmt()) - if (!visitDeclStmt(CondDecl)) - return false; - - if (!this->visitBool(IS->getCond())) - return false; - - if (const Stmt *Else = IS->getElse()) { - LabelTy LabelElse = this->getLabel(); - LabelTy LabelEnd = this->getLabel(); - if (!this->jumpFalse(LabelElse)) - return false; - if (!visitStmt(IS->getThen())) - return false; - if (!this->jump(LabelEnd)) - return false; - this->emitLabel(LabelElse); - if (!visitStmt(Else)) - return false; - this->emitLabel(LabelEnd); - } else { - LabelTy LabelEnd = this->getLabel(); - if (!this->jumpFalse(LabelEnd)) - return false; - if (!visitStmt(IS->getThen())) - return false; - this->emitLabel(LabelEnd); - } - - return true; -} - -template -bool ByteCodeStmtGen::visitWhileStmt(const WhileStmt *S) { - const Expr *Cond = S->getCond(); - const Stmt *Body = S->getBody(); - - LabelTy CondLabel = this->getLabel(); // Label before the condition. - LabelTy EndLabel = this->getLabel(); // Label after the loop. 
- LoopScope LS(this, EndLabel, CondLabel); - - this->emitLabel(CondLabel); - - if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt()) - if (!visitDeclStmt(CondDecl)) - return false; - - if (!this->visitBool(Cond)) - return false; - if (!this->jumpFalse(EndLabel)) - return false; - - LocalScope Scope(this); - { - DestructorScope DS(Scope); - if (!this->visitLoopBody(Body)) - return false; - } - - if (!this->jump(CondLabel)) - return false; - this->emitLabel(EndLabel); - - return true; -} - -template -bool ByteCodeStmtGen::visitDoStmt(const DoStmt *S) { - const Expr *Cond = S->getCond(); - const Stmt *Body = S->getBody(); - - LabelTy StartLabel = this->getLabel(); - LabelTy EndLabel = this->getLabel(); - LabelTy CondLabel = this->getLabel(); - LoopScope LS(this, EndLabel, CondLabel); - LocalScope Scope(this); - - this->emitLabel(StartLabel); - { - DestructorScope DS(Scope); - - if (!this->visitLoopBody(Body)) - return false; - this->emitLabel(CondLabel); - if (!this->visitBool(Cond)) - return false; - } - if (!this->jumpTrue(StartLabel)) - return false; - - this->emitLabel(EndLabel); - return true; -} - -template -bool ByteCodeStmtGen::visitForStmt(const ForStmt *S) { - // for (Init; Cond; Inc) { Body } - const Stmt *Init = S->getInit(); - const Expr *Cond = S->getCond(); - const Expr *Inc = S->getInc(); - const Stmt *Body = S->getBody(); - - LabelTy EndLabel = this->getLabel(); - LabelTy CondLabel = this->getLabel(); - LabelTy IncLabel = this->getLabel(); - LoopScope LS(this, EndLabel, IncLabel); - LocalScope Scope(this); - - if (Init && !this->visitStmt(Init)) - return false; - this->emitLabel(CondLabel); - - if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt()) - if (!visitDeclStmt(CondDecl)) - return false; - if (Cond) { - if (!this->visitBool(Cond)) - return false; - if (!this->jumpFalse(EndLabel)) - return false; - } - - { - DestructorScope DS(Scope); - - if (Body && !this->visitLoopBody(Body)) - return false; - this->emitLabel(IncLabel); - if (Inc && !this->discard(Inc)) - return false; - } - - if (!this->jump(CondLabel)) - return false; - this->emitLabel(EndLabel); - return true; -} - -template -bool ByteCodeStmtGen::visitCXXForRangeStmt(const CXXForRangeStmt *S) { - const Stmt *Init = S->getInit(); - const Expr *Cond = S->getCond(); - const Expr *Inc = S->getInc(); - const Stmt *Body = S->getBody(); - const Stmt *BeginStmt = S->getBeginStmt(); - const Stmt *RangeStmt = S->getRangeStmt(); - const Stmt *EndStmt = S->getEndStmt(); - const VarDecl *LoopVar = S->getLoopVariable(); - - LabelTy EndLabel = this->getLabel(); - LabelTy CondLabel = this->getLabel(); - LabelTy IncLabel = this->getLabel(); - LoopScope LS(this, EndLabel, IncLabel); - - // Emit declarations needed in the loop. - if (Init && !this->visitStmt(Init)) - return false; - if (!this->visitStmt(RangeStmt)) - return false; - if (!this->visitStmt(BeginStmt)) - return false; - if (!this->visitStmt(EndStmt)) - return false; - - // Now the condition as well as the loop variable assignment. - this->emitLabel(CondLabel); - if (!this->visitBool(Cond)) - return false; - if (!this->jumpFalse(EndLabel)) - return false; - - if (!this->visitVarDecl(LoopVar)) - return false; - - // Body. 
- LocalScope Scope(this); - { - DestructorScope DS(Scope); - - if (!this->visitLoopBody(Body)) - return false; - this->emitLabel(IncLabel); - if (!this->discard(Inc)) - return false; - } - if (!this->jump(CondLabel)) - return false; - - this->emitLabel(EndLabel); - return true; -} - -template -bool ByteCodeStmtGen::visitBreakStmt(const BreakStmt *S) { - if (!BreakLabel) - return false; - - this->VarScope->emitDestructors(); - return this->jump(*BreakLabel); -} - -template -bool ByteCodeStmtGen::visitContinueStmt(const ContinueStmt *S) { - if (!ContinueLabel) - return false; - - this->VarScope->emitDestructors(); - return this->jump(*ContinueLabel); -} - -template -bool ByteCodeStmtGen::visitSwitchStmt(const SwitchStmt *S) { - const Expr *Cond = S->getCond(); - - LabelTy EndLabel = this->getLabel(); - OptLabelTy DefaultLabel = std::nullopt; - - if (const auto *CondInit = S->getInit()) - if (!visitStmt(CondInit)) - return false; - - if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt()) - if (!visitDeclStmt(CondDecl)) - return false; - - // Initialize condition variable. - PrimType CondT = this->classifyPrim(Cond->getType()); - unsigned CondVar = this->allocateLocalPrimitive(Cond, CondT, true, false); - if (!this->visit(Cond)) - return false; - if (!this->emitSetLocal(CondT, CondVar, S)) - return false; - - CaseMap CaseLabels; - // Create labels and comparison ops for all case statements. - for (const SwitchCase *SC = S->getSwitchCaseList(); SC; - SC = SC->getNextSwitchCase()) { - if (const auto *CS = dyn_cast(SC)) { - // FIXME: Implement ranges. - if (CS->caseStmtIsGNURange()) - return false; - CaseLabels[SC] = this->getLabel(); - - const Expr *Value = CS->getLHS(); - PrimType ValueT = this->classifyPrim(Value->getType()); - - // Compare the case statement's value to the switch condition. - if (!this->emitGetLocal(CondT, CondVar, CS)) - return false; - if (!this->visit(Value)) - return false; - - // Compare and jump to the case label. - if (!this->emitEQ(ValueT, S)) - return false; - if (!this->jumpTrue(CaseLabels[CS])) - return false; - } else { - assert(!DefaultLabel); - DefaultLabel = this->getLabel(); - } - } - - // If none of the conditions above were true, fall through to the default - // statement or jump after the switch statement. - if (DefaultLabel) { - if (!this->jump(*DefaultLabel)) - return false; - } else { - if (!this->jump(EndLabel)) - return false; - } - - SwitchScope SS(this, std::move(CaseLabels), EndLabel, DefaultLabel); - if (!this->visitStmt(S->getBody())) - return false; - this->emitLabel(EndLabel); - return true; -} - -template -bool ByteCodeStmtGen::visitCaseStmt(const CaseStmt *S) { - this->emitLabel(CaseLabels[S]); - return this->visitStmt(S->getSubStmt()); -} - -template -bool ByteCodeStmtGen::visitDefaultStmt(const DefaultStmt *S) { - this->emitLabel(*DefaultLabel); - return this->visitStmt(S->getSubStmt()); -} - -template -bool ByteCodeStmtGen::visitAttributedStmt(const AttributedStmt *S) { - - if (this->Ctx.getLangOpts().CXXAssumptions && - !this->Ctx.getLangOpts().MSVCCompat) { - for (const Attr *A : S->getAttrs()) { - auto *AA = dyn_cast(A); - if (!AA) - continue; - - assert(isa(S->getSubStmt())); - - const Expr *Assumption = AA->getAssumption(); - if (Assumption->isValueDependent()) - return false; - - if (Assumption->HasSideEffects(this->Ctx.getASTContext())) - continue; - - // Evaluate assumption. - if (!this->visitBool(Assumption)) - return false; - - if (!this->emitAssume(Assumption)) - return false; - } - } - - // Ignore other attributes. 
- return this->visitStmt(S->getSubStmt()); -} - -template -bool ByteCodeStmtGen::visitCXXTryStmt(const CXXTryStmt *S) { - // Ignore all handlers. - return this->visitStmt(S->getTryBlock()); -} - -namespace clang { -namespace interp { - -template class ByteCodeStmtGen; - -} // namespace interp -} // namespace clang diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.h b/clang/lib/AST/Interp/ByteCodeStmtGen.h deleted file mode 100644 index d7e6e5042c2740..00000000000000 --- a/clang/lib/AST/Interp/ByteCodeStmtGen.h +++ /dev/null @@ -1,91 +0,0 @@ -//===--- ByteCodeStmtGen.h - Code generator for expressions -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Defines the constexpr bytecode compiler. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_AST_INTERP_BYTECODESTMTGEN_H -#define LLVM_CLANG_AST_INTERP_BYTECODESTMTGEN_H - -#include "ByteCodeEmitter.h" -#include "ByteCodeExprGen.h" -#include "EvalEmitter.h" -#include "PrimType.h" -#include "clang/AST/StmtVisitor.h" - -namespace clang { -namespace interp { - -template class LoopScope; -template class SwitchScope; -template class LabelScope; - -/// Compilation context for statements. -template -class ByteCodeStmtGen final : public ByteCodeExprGen { - using LabelTy = typename Emitter::LabelTy; - using AddrTy = typename Emitter::AddrTy; - using OptLabelTy = std::optional; - using CaseMap = llvm::DenseMap; - -public: - template - ByteCodeStmtGen(Tys&&... Args) - : ByteCodeExprGen(std::forward(Args)...) {} - -protected: - bool visitFunc(const FunctionDecl *F) override; - -private: - friend class LabelScope; - friend class LoopScope; - friend class SwitchScope; - - // Statement visitors. - bool visitStmt(const Stmt *S); - bool visitCompoundStmt(const CompoundStmt *S); - bool visitLoopBody(const Stmt *S); - bool visitDeclStmt(const DeclStmt *DS); - bool visitReturnStmt(const ReturnStmt *RS); - bool visitIfStmt(const IfStmt *IS); - bool visitWhileStmt(const WhileStmt *S); - bool visitDoStmt(const DoStmt *S); - bool visitForStmt(const ForStmt *S); - bool visitCXXForRangeStmt(const CXXForRangeStmt *S); - bool visitBreakStmt(const BreakStmt *S); - bool visitContinueStmt(const ContinueStmt *S); - bool visitSwitchStmt(const SwitchStmt *S); - bool visitCaseStmt(const CaseStmt *S); - bool visitDefaultStmt(const DefaultStmt *S); - bool visitAttributedStmt(const AttributedStmt *S); - bool visitCXXTryStmt(const CXXTryStmt *S); - - bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD); - - /// Type of the expression returned by the function. - std::optional ReturnType; - - /// Switch case mapping. - CaseMap CaseLabels; - - /// Point to break to. - OptLabelTy BreakLabel; - /// Point to continue to. - OptLabelTy ContinueLabel; - /// Default case label. 
- OptLabelTy DefaultLabel; -}; - -extern template class ByteCodeStmtGen; -extern template class ByteCodeExprGen; - -} // namespace interp -} // namespace clang - -#endif diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/Compiler.cpp similarity index 79% rename from clang/lib/AST/Interp/ByteCodeExprGen.cpp rename to clang/lib/AST/Interp/Compiler.cpp index 72c569f56a7887..424f4f84a01673 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/Compiler.cpp @@ -1,4 +1,4 @@ -//===--- ByteCodeExprGen.cpp - Code generator for expressions ---*- C++ -*-===// +//===--- Compiler.cpp - Code generator for expressions ---*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,9 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "ByteCodeExprGen.h" +#include "Compiler.h" #include "ByteCodeEmitter.h" -#include "ByteCodeStmtGen.h" #include "Context.h" #include "Floating.h" #include "Function.h" @@ -28,7 +27,7 @@ namespace interp { /// Scope used to handle temporaries in toplevel variable declarations. template class DeclScope final : public VariableScope { public: - DeclScope(ByteCodeExprGen *Ctx, const ValueDecl *VD) + DeclScope(Compiler *Ctx, const ValueDecl *VD) : VariableScope(Ctx, nullptr), Scope(Ctx->P, VD), OldGlobalDecl(Ctx->GlobalDecl), OldInitializingDecl(Ctx->InitializingDecl) { @@ -37,6 +36,10 @@ template class DeclScope final : public VariableScope { Ctx->InitStack.push_back(InitLink::Decl(VD)); } + void addExtended(const Scope::Local &Local) override { + return this->addLocal(Local); + } + ~DeclScope() { this->Ctx->GlobalDecl = OldGlobalDecl; this->Ctx->InitializingDecl = OldInitializingDecl; @@ -53,7 +56,7 @@ template class DeclScope final : public VariableScope { template class OptionScope final { public: /// Root constructor, compiling or discarding primitives. - OptionScope(ByteCodeExprGen *Ctx, bool NewDiscardResult, + OptionScope(Compiler *Ctx, bool NewDiscardResult, bool NewInitializing) : Ctx(Ctx), OldDiscardResult(Ctx->DiscardResult), OldInitializing(Ctx->Initializing) { @@ -68,14 +71,14 @@ template class OptionScope final { private: /// Parent context. - ByteCodeExprGen *Ctx; + Compiler *Ctx; /// Old discard flag to restore. bool OldDiscardResult; bool OldInitializing; }; template -bool InitLink::emit(ByteCodeExprGen *Ctx, const Expr *E) const { +bool InitLink::emit(Compiler *Ctx, const Expr *E) const { switch (Kind) { case K_This: return Ctx->emitThis(E); @@ -88,11 +91,74 @@ bool InitLink::emit(ByteCodeExprGen *Ctx, const Expr *E) const { return true; } +/// Scope managing label targets. +template class LabelScope { +public: + virtual ~LabelScope() {} + +protected: + LabelScope(Compiler *Ctx) : Ctx(Ctx) {} + /// Compiler instance. + Compiler *Ctx; +}; + +/// Sets the context for break/continue statements. 
+template class LoopScope final : public LabelScope { +public: + using LabelTy = typename Compiler::LabelTy; + using OptLabelTy = typename Compiler::OptLabelTy; + + LoopScope(Compiler *Ctx, LabelTy BreakLabel, LabelTy ContinueLabel) + : LabelScope(Ctx), OldBreakLabel(Ctx->BreakLabel), + OldContinueLabel(Ctx->ContinueLabel) { + this->Ctx->BreakLabel = BreakLabel; + this->Ctx->ContinueLabel = ContinueLabel; + } + + ~LoopScope() { + this->Ctx->BreakLabel = OldBreakLabel; + this->Ctx->ContinueLabel = OldContinueLabel; + } + +private: + OptLabelTy OldBreakLabel; + OptLabelTy OldContinueLabel; +}; + +// Sets the context for a switch scope, mapping labels. +template class SwitchScope final : public LabelScope { +public: + using LabelTy = typename Compiler::LabelTy; + using OptLabelTy = typename Compiler::OptLabelTy; + using CaseMap = typename Compiler::CaseMap; + + SwitchScope(Compiler *Ctx, CaseMap &&CaseLabels, LabelTy BreakLabel, + OptLabelTy DefaultLabel) + : LabelScope(Ctx), OldBreakLabel(Ctx->BreakLabel), + OldDefaultLabel(this->Ctx->DefaultLabel), + OldCaseLabels(std::move(this->Ctx->CaseLabels)) { + this->Ctx->BreakLabel = BreakLabel; + this->Ctx->DefaultLabel = DefaultLabel; + this->Ctx->CaseLabels = std::move(CaseLabels); + } + + ~SwitchScope() { + this->Ctx->BreakLabel = OldBreakLabel; + this->Ctx->DefaultLabel = OldDefaultLabel; + this->Ctx->CaseLabels = std::move(OldCaseLabels); + } + +private: + OptLabelTy OldBreakLabel; + OptLabelTy OldDefaultLabel; + CaseMap OldCaseLabels; +}; + } // namespace interp } // namespace clang template -bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { +bool Compiler::VisitCastExpr(const CastExpr *CE) { const Expr *SubExpr = CE->getSubExpr(); switch (CE->getCastKind()) { @@ -546,7 +612,7 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { } template -bool ByteCodeExprGen::VisitIntegerLiteral(const IntegerLiteral *LE) { +bool Compiler::VisitIntegerLiteral(const IntegerLiteral *LE) { if (DiscardResult) return true; @@ -554,7 +620,7 @@ bool ByteCodeExprGen::VisitIntegerLiteral(const IntegerLiteral *LE) { } template -bool ByteCodeExprGen::VisitFloatingLiteral(const FloatingLiteral *E) { +bool Compiler::VisitFloatingLiteral(const FloatingLiteral *E) { if (DiscardResult) return true; @@ -562,8 +628,7 @@ bool ByteCodeExprGen::VisitFloatingLiteral(const FloatingLiteral *E) { } template -bool ByteCodeExprGen::VisitImaginaryLiteral( - const ImaginaryLiteral *E) { +bool Compiler::VisitImaginaryLiteral(const ImaginaryLiteral *E) { assert(E->getType()->isAnyComplexType()); if (DiscardResult) return true; @@ -587,12 +652,12 @@ bool ByteCodeExprGen::VisitImaginaryLiteral( } template -bool ByteCodeExprGen::VisitParenExpr(const ParenExpr *E) { +bool Compiler::VisitParenExpr(const ParenExpr *E) { return this->delegate(E->getSubExpr()); } template -bool ByteCodeExprGen::VisitBinaryOperator(const BinaryOperator *BO) { +bool Compiler::VisitBinaryOperator(const BinaryOperator *BO) { // Need short-circuiting for these. if (BO->isLogicalOp()) return this->VisitLogicalBinOp(BO); @@ -764,7 +829,7 @@ bool ByteCodeExprGen::VisitBinaryOperator(const BinaryOperator *BO) { /// Perform addition/subtraction of a pointer and an integer or /// subtraction of two pointers. 
template -bool ByteCodeExprGen::VisitPointerArithBinOp(const BinaryOperator *E) { +bool Compiler::VisitPointerArithBinOp(const BinaryOperator *E) { BinaryOperatorKind Op = E->getOpcode(); const Expr *LHS = E->getLHS(); const Expr *RHS = E->getRHS(); @@ -812,7 +877,7 @@ bool ByteCodeExprGen::VisitPointerArithBinOp(const BinaryOperator *E) { } template -bool ByteCodeExprGen::VisitLogicalBinOp(const BinaryOperator *E) { +bool Compiler::VisitLogicalBinOp(const BinaryOperator *E) { assert(E->isLogicalOp()); BinaryOperatorKind Op = E->getOpcode(); const Expr *LHS = E->getLHS(); @@ -873,7 +938,7 @@ bool ByteCodeExprGen::VisitLogicalBinOp(const BinaryOperator *E) { } template -bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { +bool Compiler::VisitComplexBinOp(const BinaryOperator *E) { // Prepare storage for result. if (!Initializing) { std::optional LocalIndex = allocateLocal(E); @@ -1101,7 +1166,8 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { } template -bool ByteCodeExprGen::VisitImplicitValueInitExpr(const ImplicitValueInitExpr *E) { +bool Compiler::VisitImplicitValueInitExpr( + const ImplicitValueInitExpr *E) { QualType QT = E->getType(); if (std::optional T = classify(QT)) @@ -1184,8 +1250,7 @@ bool ByteCodeExprGen::VisitImplicitValueInitExpr(const ImplicitValueIni } template -bool ByteCodeExprGen::VisitArraySubscriptExpr( - const ArraySubscriptExpr *E) { +bool Compiler::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { const Expr *Base = E->getBase(); const Expr *Index = E->getIdx(); @@ -1205,9 +1270,8 @@ bool ByteCodeExprGen::VisitArraySubscriptExpr( } template -bool ByteCodeExprGen::visitInitList(ArrayRef Inits, - const Expr *ArrayFiller, - const Expr *E) { +bool Compiler::visitInitList(ArrayRef Inits, + const Expr *ArrayFiller, const Expr *E) { if (E->getType()->isVoidType()) return this->emitInvalid(E); @@ -1234,7 +1298,7 @@ bool ByteCodeExprGen::visitInitList(ArrayRef Inits, const Record *R = getRecord(E->getType()); if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) - return this->visitInitializer(Inits[0]); + return this->delegate(Inits[0]); auto initPrimitiveField = [=](const Record::Field *FieldToInit, const Expr *Init, PrimType T) -> bool { @@ -1329,22 +1393,8 @@ bool ByteCodeExprGen::visitInitList(ArrayRef Inits, } if (T->isArrayType()) { - // Prepare composite return value. - if (!Initializing) { - if (GlobalDecl) { - std::optional GlobalIndex = P.createGlobal(E); - if (!GlobalIndex) - return false; - if (!this->emitGetPtrGlobal(*GlobalIndex, E)) - return false; - } else { - std::optional LocalIndex = allocateLocal(E); - if (!LocalIndex) - return false; - if (!this->emitGetPtrLocal(*LocalIndex, E)) - return false; - } - } + if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) + return this->delegate(Inits[0]); unsigned ElementIndex = 0; for (const Expr *Init : Inits) { @@ -1433,7 +1483,8 @@ bool ByteCodeExprGen::visitInitList(ArrayRef Inits, // If the initializer is of vector type itself, we have to deconstruct // that and initialize all the target fields from the initializer fields. if (const auto *InitVecT = Init->getType()->getAs()) { - if (!this->emitCopyArray(ElemT, 0, InitIndex, InitVecT->getNumElements(), E)) + if (!this->emitCopyArray(ElemT, 0, InitIndex, + InitVecT->getNumElements(), E)) return false; InitIndex += InitVecT->getNumElements(); } else { @@ -1461,8 +1512,8 @@ bool ByteCodeExprGen::visitInitList(ArrayRef Inits, /// Pointer to the array(not the element!) 
must be on the stack when calling /// this. template -bool ByteCodeExprGen::visitArrayElemInit(unsigned ElemIndex, - const Expr *Init) { +bool Compiler::visitArrayElemInit(unsigned ElemIndex, + const Expr *Init) { if (std::optional T = classify(Init->getType())) { // Visit the primitive element like normal. if (!this->visit(Init)) @@ -1482,24 +1533,24 @@ bool ByteCodeExprGen::visitArrayElemInit(unsigned ElemIndex, } template -bool ByteCodeExprGen::VisitInitListExpr(const InitListExpr *E) { +bool Compiler::VisitInitListExpr(const InitListExpr *E) { return this->visitInitList(E->inits(), E->getArrayFiller(), E); } template -bool ByteCodeExprGen::VisitCXXParenListInitExpr( +bool Compiler::VisitCXXParenListInitExpr( const CXXParenListInitExpr *E) { return this->visitInitList(E->getInitExprs(), E->getArrayFiller(), E); } template -bool ByteCodeExprGen::VisitSubstNonTypeTemplateParmExpr( +bool Compiler::VisitSubstNonTypeTemplateParmExpr( const SubstNonTypeTemplateParmExpr *E) { return this->delegate(E->getReplacement()); } template -bool ByteCodeExprGen::VisitConstantExpr(const ConstantExpr *E) { +bool Compiler::VisitConstantExpr(const ConstantExpr *E) { std::optional T = classify(E->getType()); if (T && E->hasAPValueResult()) { // Try to emit the APValue directly, without visiting the subexpr. @@ -1515,7 +1566,7 @@ bool ByteCodeExprGen::VisitConstantExpr(const ConstantExpr *E) { } template -bool ByteCodeExprGen::VisitEmbedExpr(const EmbedExpr *E) { +bool Compiler::VisitEmbedExpr(const EmbedExpr *E) { auto It = E->begin(); return this->visit(*It); } @@ -1544,7 +1595,7 @@ static CharUnits AlignOfType(QualType T, const ASTContext &ASTCtx, } template -bool ByteCodeExprGen::VisitUnaryExprOrTypeTraitExpr( +bool Compiler::VisitUnaryExprOrTypeTraitExpr( const UnaryExprOrTypeTraitExpr *E) { UnaryExprOrTypeTrait Kind = E->getKind(); const ASTContext &ASTCtx = Ctx.getASTContext(); @@ -1636,7 +1687,7 @@ bool ByteCodeExprGen::VisitUnaryExprOrTypeTraitExpr( } template -bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { +bool Compiler::VisitMemberExpr(const MemberExpr *E) { // 'Base.Member' const Expr *Base = E->getBase(); const ValueDecl *Member = E->getMemberDecl(); @@ -1688,8 +1739,7 @@ bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { } template -bool ByteCodeExprGen::VisitArrayInitIndexExpr( - const ArrayInitIndexExpr *E) { +bool Compiler::VisitArrayInitIndexExpr(const ArrayInitIndexExpr *E) { // ArrayIndex might not be set if a ArrayInitIndexExpr is being evaluated // stand-alone, e.g. via EvaluateAsInt(). 
if (!ArrayIndex) @@ -1698,8 +1748,7 @@ bool ByteCodeExprGen::VisitArrayInitIndexExpr( } template -bool ByteCodeExprGen::VisitArrayInitLoopExpr( - const ArrayInitLoopExpr *E) { +bool Compiler::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E) { assert(Initializing); assert(!DiscardResult); @@ -1727,7 +1776,7 @@ bool ByteCodeExprGen::VisitArrayInitLoopExpr( } template -bool ByteCodeExprGen::VisitOpaqueValueExpr(const OpaqueValueExpr *E) { +bool Compiler::VisitOpaqueValueExpr(const OpaqueValueExpr *E) { const Expr *SourceExpr = E->getSourceExpr(); if (!SourceExpr) return false; @@ -1763,7 +1812,7 @@ bool ByteCodeExprGen::VisitOpaqueValueExpr(const OpaqueValueExpr *E) { } template -bool ByteCodeExprGen::VisitAbstractConditionalOperator( +bool Compiler::VisitAbstractConditionalOperator( const AbstractConditionalOperator *E) { const Expr *Condition = E->getCond(); const Expr *TrueExpr = E->getTrueExpr(); @@ -1795,7 +1844,7 @@ bool ByteCodeExprGen::VisitAbstractConditionalOperator( } template -bool ByteCodeExprGen::VisitStringLiteral(const StringLiteral *E) { +bool Compiler::VisitStringLiteral(const StringLiteral *E) { if (DiscardResult) return true; @@ -1852,13 +1901,12 @@ bool ByteCodeExprGen::VisitStringLiteral(const StringLiteral *E) { } template -bool ByteCodeExprGen::VisitObjCStringLiteral( - const ObjCStringLiteral *E) { +bool Compiler::VisitObjCStringLiteral(const ObjCStringLiteral *E) { return this->delegate(E->getString()); } template -bool ByteCodeExprGen::VisitObjCEncodeExpr(const ObjCEncodeExpr *E) { +bool Compiler::VisitObjCEncodeExpr(const ObjCEncodeExpr *E) { auto &A = Ctx.getASTContext(); std::string Str; A.getObjCEncodingForType(E->getEncodedType(), Str); @@ -1869,7 +1917,7 @@ bool ByteCodeExprGen::VisitObjCEncodeExpr(const ObjCEncodeExpr *E) { } template -bool ByteCodeExprGen::VisitSYCLUniqueStableNameExpr( +bool Compiler::VisitSYCLUniqueStableNameExpr( const SYCLUniqueStableNameExpr *E) { if (DiscardResult) return true; @@ -1893,15 +1941,14 @@ bool ByteCodeExprGen::VisitSYCLUniqueStableNameExpr( } template -bool ByteCodeExprGen::VisitCharacterLiteral( - const CharacterLiteral *E) { +bool Compiler::VisitCharacterLiteral(const CharacterLiteral *E) { if (DiscardResult) return true; return this->emitConst(E->getValue(), E); } template -bool ByteCodeExprGen::VisitFloatCompoundAssignOperator( +bool Compiler::VisitFloatCompoundAssignOperator( const CompoundAssignOperator *E) { const Expr *LHS = E->getLHS(); @@ -1975,7 +2022,7 @@ bool ByteCodeExprGen::VisitFloatCompoundAssignOperator( } template -bool ByteCodeExprGen::VisitPointerCompoundAssignOperator( +bool Compiler::VisitPointerCompoundAssignOperator( const CompoundAssignOperator *E) { BinaryOperatorKind Op = E->getOpcode(); const Expr *LHS = E->getLHS(); @@ -2012,7 +2059,7 @@ bool ByteCodeExprGen::VisitPointerCompoundAssignOperator( } template -bool ByteCodeExprGen::VisitCompoundAssignOperator( +bool Compiler::VisitCompoundAssignOperator( const CompoundAssignOperator *E) { const Expr *LHS = E->getLHS(); @@ -2133,8 +2180,7 @@ bool ByteCodeExprGen::VisitCompoundAssignOperator( } template -bool ByteCodeExprGen::VisitExprWithCleanups( - const ExprWithCleanups *E) { +bool Compiler::VisitExprWithCleanups(const ExprWithCleanups *E) { ExprScope ES(this); const Expr *SubExpr = E->getSubExpr(); @@ -2144,13 +2190,13 @@ bool ByteCodeExprGen::VisitExprWithCleanups( } template -bool ByteCodeExprGen::VisitMaterializeTemporaryExpr( +bool Compiler::VisitMaterializeTemporaryExpr( const MaterializeTemporaryExpr *E) { const Expr *SubExpr = 
E->getSubExpr(); if (Initializing) { // We already have a value, just initialize that. - return this->visitInitializer(SubExpr); + return this->delegate(SubExpr); } // If we don't end up using the materialized temporary anyway, don't // bother creating it. @@ -2216,14 +2262,13 @@ bool ByteCodeExprGen::VisitMaterializeTemporaryExpr( } template -bool ByteCodeExprGen::VisitCXXBindTemporaryExpr( +bool Compiler::VisitCXXBindTemporaryExpr( const CXXBindTemporaryExpr *E) { return this->delegate(E->getSubExpr()); } template -bool ByteCodeExprGen::VisitCompoundLiteralExpr( - const CompoundLiteralExpr *E) { +bool Compiler::VisitCompoundLiteralExpr(const CompoundLiteralExpr *E) { const Expr *Init = E->getInitializer(); if (Initializing) { // We already have a value, just initialize that. @@ -2288,7 +2333,7 @@ bool ByteCodeExprGen::VisitCompoundLiteralExpr( } template -bool ByteCodeExprGen::VisitTypeTraitExpr(const TypeTraitExpr *E) { +bool Compiler::VisitTypeTraitExpr(const TypeTraitExpr *E) { if (DiscardResult) return true; if (E->getType()->isBooleanType()) @@ -2297,15 +2342,14 @@ bool ByteCodeExprGen::VisitTypeTraitExpr(const TypeTraitExpr *E) { } template -bool ByteCodeExprGen::VisitArrayTypeTraitExpr( - const ArrayTypeTraitExpr *E) { +bool Compiler::VisitArrayTypeTraitExpr(const ArrayTypeTraitExpr *E) { if (DiscardResult) return true; return this->emitConst(E->getValue(), E); } template -bool ByteCodeExprGen::VisitLambdaExpr(const LambdaExpr *E) { +bool Compiler::VisitLambdaExpr(const LambdaExpr *E) { if (DiscardResult) return true; @@ -2344,7 +2388,7 @@ bool ByteCodeExprGen::VisitLambdaExpr(const LambdaExpr *E) { } template -bool ByteCodeExprGen::VisitPredefinedExpr(const PredefinedExpr *E) { +bool Compiler::VisitPredefinedExpr(const PredefinedExpr *E) { if (DiscardResult) return true; @@ -2352,7 +2396,7 @@ bool ByteCodeExprGen::VisitPredefinedExpr(const PredefinedExpr *E) { } template -bool ByteCodeExprGen::VisitCXXThrowExpr(const CXXThrowExpr *E) { +bool Compiler::VisitCXXThrowExpr(const CXXThrowExpr *E) { if (E->getSubExpr() && !this->discard(E->getSubExpr())) return false; @@ -2360,7 +2404,7 @@ bool ByteCodeExprGen::VisitCXXThrowExpr(const CXXThrowExpr *E) { } template -bool ByteCodeExprGen::VisitCXXReinterpretCastExpr( +bool Compiler::VisitCXXReinterpretCastExpr( const CXXReinterpretCastExpr *E) { if (!this->discard(E->getSubExpr())) return false; @@ -2369,7 +2413,7 @@ bool ByteCodeExprGen::VisitCXXReinterpretCastExpr( } template -bool ByteCodeExprGen::VisitCXXNoexceptExpr(const CXXNoexceptExpr *E) { +bool Compiler::VisitCXXNoexceptExpr(const CXXNoexceptExpr *E) { assert(E->getType()->isBooleanType()); if (DiscardResult) @@ -2378,8 +2422,7 @@ bool ByteCodeExprGen::VisitCXXNoexceptExpr(const CXXNoexceptExpr *E) { } template -bool ByteCodeExprGen::VisitCXXConstructExpr( - const CXXConstructExpr *E) { +bool Compiler::VisitCXXConstructExpr(const CXXConstructExpr *E) { QualType T = E->getType(); assert(!classify(T)); @@ -2491,7 +2534,7 @@ bool ByteCodeExprGen::VisitCXXConstructExpr( } template -bool ByteCodeExprGen::VisitSourceLocExpr(const SourceLocExpr *E) { +bool Compiler::VisitSourceLocExpr(const SourceLocExpr *E) { if (DiscardResult) return true; @@ -2547,7 +2590,7 @@ bool ByteCodeExprGen::VisitSourceLocExpr(const SourceLocExpr *E) { } template -bool ByteCodeExprGen::VisitOffsetOfExpr(const OffsetOfExpr *E) { +bool Compiler::VisitOffsetOfExpr(const OffsetOfExpr *E) { unsigned N = E->getNumComponents(); if (N == 0) return false; @@ -2582,7 +2625,7 @@ bool 
ByteCodeExprGen::VisitOffsetOfExpr(const OffsetOfExpr *E) { } template -bool ByteCodeExprGen::VisitCXXScalarValueInitExpr( +bool Compiler::VisitCXXScalarValueInitExpr( const CXXScalarValueInitExpr *E) { QualType Ty = E->getType(); @@ -2641,24 +2684,23 @@ bool ByteCodeExprGen::VisitCXXScalarValueInitExpr( } template -bool ByteCodeExprGen::VisitSizeOfPackExpr(const SizeOfPackExpr *E) { +bool Compiler::VisitSizeOfPackExpr(const SizeOfPackExpr *E) { return this->emitConst(E->getPackLength(), E); } template -bool ByteCodeExprGen::VisitGenericSelectionExpr( +bool Compiler::VisitGenericSelectionExpr( const GenericSelectionExpr *E) { return this->delegate(E->getResultExpr()); } template -bool ByteCodeExprGen::VisitChooseExpr(const ChooseExpr *E) { +bool Compiler::VisitChooseExpr(const ChooseExpr *E) { return this->delegate(E->getChosenSubExpr()); } template -bool ByteCodeExprGen::VisitObjCBoolLiteralExpr( - const ObjCBoolLiteralExpr *E) { +bool Compiler::VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *E) { if (DiscardResult) return true; @@ -2666,7 +2708,7 @@ bool ByteCodeExprGen::VisitObjCBoolLiteralExpr( } template -bool ByteCodeExprGen::VisitCXXInheritedCtorInitExpr( +bool Compiler::VisitCXXInheritedCtorInitExpr( const CXXInheritedCtorInitExpr *E) { const CXXConstructorDecl *Ctor = E->getConstructor(); assert(!Ctor->isTrivial() && @@ -2697,19 +2739,29 @@ bool ByteCodeExprGen::VisitCXXInheritedCtorInitExpr( } template -bool ByteCodeExprGen::VisitExpressionTraitExpr( - const ExpressionTraitExpr *E) { +bool Compiler::VisitExpressionTraitExpr(const ExpressionTraitExpr *E) { assert(Ctx.getLangOpts().CPlusPlus); return this->emitConstBool(E->getValue(), E); } template -bool ByteCodeExprGen::VisitCXXUuidofExpr(const CXXUuidofExpr *E) { +bool Compiler::VisitCXXUuidofExpr(const CXXUuidofExpr *E) { if (DiscardResult) return true; assert(!Initializing); - std::optional GlobalIndex = P.getOrCreateGlobal(E->getGuidDecl()); + const MSGuidDecl *GuidDecl = E->getGuidDecl(); + const RecordDecl *RD = GuidDecl->getType()->getAsRecordDecl(); + assert(RD); + // If the definition of the result type is incomplete, just return a dummy. + // If (and when) that is read from, we will fail, but not now.
+ if (!RD->isCompleteDefinition()) { + if (std::optional I = P.getOrCreateDummy(GuidDecl)) + return this->emitGetPtrGlobal(*I, E); + return false; + } + + std::optional GlobalIndex = P.getOrCreateGlobal(GuidDecl); if (!GlobalIndex) return false; if (!this->emitGetPtrGlobal(*GlobalIndex, E)) @@ -2717,7 +2769,7 @@ bool ByteCodeExprGen::VisitCXXUuidofExpr(const CXXUuidofExpr *E) { assert(this->getRecord(E->getType())); - const APValue &V = E->getGuidDecl()->getAsAPValue(); + const APValue &V = GuidDecl->getAsAPValue(); if (V.getKind() == APValue::None) return true; @@ -2730,7 +2782,7 @@ bool ByteCodeExprGen::VisitCXXUuidofExpr(const CXXUuidofExpr *E) { } template -bool ByteCodeExprGen::VisitRequiresExpr(const RequiresExpr *E) { +bool Compiler::VisitRequiresExpr(const RequiresExpr *E) { assert(classifyPrim(E->getType()) == PT_Bool); if (DiscardResult) return true; @@ -2738,7 +2790,7 @@ bool ByteCodeExprGen::VisitRequiresExpr(const RequiresExpr *E) { } template -bool ByteCodeExprGen::VisitConceptSpecializationExpr( +bool Compiler::VisitConceptSpecializationExpr( const ConceptSpecializationExpr *E) { assert(classifyPrim(E->getType()) == PT_Bool); if (DiscardResult) @@ -2747,14 +2799,13 @@ bool ByteCodeExprGen::VisitConceptSpecializationExpr( } template -bool ByteCodeExprGen::VisitCXXRewrittenBinaryOperator( +bool Compiler::VisitCXXRewrittenBinaryOperator( const CXXRewrittenBinaryOperator *E) { return this->delegate(E->getSemanticForm()); } template -bool ByteCodeExprGen::VisitPseudoObjectExpr( - const PseudoObjectExpr *E) { +bool Compiler::VisitPseudoObjectExpr(const PseudoObjectExpr *E) { for (const Expr *SemE : E->semantics()) { if (auto *OVE = dyn_cast(SemE)) { @@ -2778,18 +2829,17 @@ bool ByteCodeExprGen::VisitPseudoObjectExpr( } template -bool ByteCodeExprGen::VisitPackIndexingExpr( - const PackIndexingExpr *E) { +bool Compiler::VisitPackIndexingExpr(const PackIndexingExpr *E) { return this->delegate(E->getSelectedExpr()); } template -bool ByteCodeExprGen::VisitRecoveryExpr(const RecoveryExpr *E) { +bool Compiler::VisitRecoveryExpr(const RecoveryExpr *E) { return this->emitError(E); } template -bool ByteCodeExprGen::VisitAddrLabelExpr(const AddrLabelExpr *E) { +bool Compiler::VisitAddrLabelExpr(const AddrLabelExpr *E) { assert(E->getType()->isVoidPointerType()); unsigned Offset = allocateLocalPrimitive( @@ -2799,8 +2849,7 @@ bool ByteCodeExprGen::VisitAddrLabelExpr(const AddrLabelExpr *E) { } template -bool ByteCodeExprGen::VisitConvertVectorExpr( - const ConvertVectorExpr *E) { +bool Compiler::VisitConvertVectorExpr(const ConvertVectorExpr *E) { assert(Initializing); const auto *VT = E->getType()->castAs(); QualType ElemType = VT->getElementType(); @@ -2832,8 +2881,7 @@ bool ByteCodeExprGen::VisitConvertVectorExpr( } template -bool ByteCodeExprGen::VisitShuffleVectorExpr( - const ShuffleVectorExpr *E) { +bool Compiler::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) { assert(Initializing); assert(E->getNumSubExprs() > 2); @@ -2875,7 +2923,7 @@ bool ByteCodeExprGen::VisitShuffleVectorExpr( } template -bool ByteCodeExprGen::VisitExtVectorElementExpr( +bool Compiler::VisitExtVectorElementExpr( const ExtVectorElementExpr *E) { const Expr *Base = E->getBase(); assert( @@ -2937,28 +2985,89 @@ bool ByteCodeExprGen::VisitExtVectorElementExpr( } template -bool ByteCodeExprGen::VisitObjCBoxedExpr(const ObjCBoxedExpr *E) { +bool Compiler::VisitObjCBoxedExpr(const ObjCBoxedExpr *E) { if (!E->isExpressibleAsConstantInitializer()) return this->emitInvalid(E); return this->delegate(E->getSubExpr()); } 
-template bool ByteCodeExprGen::discard(const Expr *E) { +template +bool Compiler::VisitCXXStdInitializerListExpr( + const CXXStdInitializerListExpr *E) { + const Expr *SubExpr = E->getSubExpr(); + const ConstantArrayType *ArrayType = + Ctx.getASTContext().getAsConstantArrayType(SubExpr->getType()); + const Record *R = getRecord(E->getType()); + assert(Initializing); + assert(SubExpr->isGLValue()); + + if (!this->visit(SubExpr)) + return false; + if (!this->emitInitFieldPtr(R->getField(0u)->Offset, E)) + return false; + + PrimType SecondFieldT = classifyPrim(R->getField(1u)->Decl->getType()); + if (isIntegralType(SecondFieldT)) { + if (!this->emitConst(static_cast(ArrayType->getSize()), + SecondFieldT, E)) + return false; + return this->emitInitField(SecondFieldT, R->getField(1u)->Offset, E); + } + assert(SecondFieldT == PT_Ptr); + + if (!this->emitGetFieldPtr(R->getField(0u)->Offset, E)) + return false; + if (!this->emitConst(static_cast(ArrayType->getSize()), PT_Uint64, E)) + return false; + if (!this->emitArrayElemPtrPop(PT_Uint64, E)) + return false; + return this->emitInitFieldPtr(R->getField(1u)->Offset, E); +} + +template +bool Compiler::VisitStmtExpr(const StmtExpr *E) { + BlockScope BS(this); + + const CompoundStmt *CS = E->getSubStmt(); + const Stmt *Result = CS->getStmtExprResult(); + for (const Stmt *S : CS->body()) { + if (S != Result) { + if (!this->visitStmt(S)) + return false; + continue; + } + + assert(S == Result); + // This better produces a value (i.e. is an expression). + if (const Expr *ResultExpr = dyn_cast(S)) { + if (DiscardResult) + return this->discard(ResultExpr); + return this->delegate(ResultExpr); + } + return false; + } + + return true; +} + +template bool Compiler::discard(const Expr *E) { OptionScope Scope(this, /*NewDiscardResult=*/true, /*NewInitializing=*/false); return this->Visit(E); } -template -bool ByteCodeExprGen::delegate(const Expr *E) { +template bool Compiler::delegate(const Expr *E) { + if (E->containsErrors()) + return this->emitError(E); + // We're basically doing: // OptionScope Scope(this, DicardResult, Initializing); // but that's unnecessary of course. return this->Visit(E); } -template bool ByteCodeExprGen::visit(const Expr *E) { +template bool Compiler::visit(const Expr *E) { if (E->getType().isNull()) return false; @@ -2985,7 +3094,7 @@ template bool ByteCodeExprGen::visit(const Expr *E) { } template -bool ByteCodeExprGen::visitInitializer(const Expr *E) { +bool Compiler::visitInitializer(const Expr *E) { assert(!classify(E->getType())); if (E->containsErrors()) @@ -2996,8 +3105,7 @@ bool ByteCodeExprGen::visitInitializer(const Expr *E) { return this->Visit(E); } -template -bool ByteCodeExprGen::visitBool(const Expr *E) { +template bool Compiler::visitBool(const Expr *E) { std::optional T = classify(E->getType()); if (!T) { // Convert complex values to bool. 
@@ -3031,8 +3139,8 @@ bool ByteCodeExprGen::visitBool(const Expr *E) { } template -bool ByteCodeExprGen::visitZeroInitializer(PrimType T, QualType QT, - const Expr *E) { +bool Compiler::visitZeroInitializer(PrimType T, QualType QT, + const Expr *E) { switch (T) { case PT_Bool: return this->emitZeroBool(E); @@ -3070,8 +3178,8 @@ bool ByteCodeExprGen::visitZeroInitializer(PrimType T, QualType QT, } template -bool ByteCodeExprGen::visitZeroRecordInitializer(const Record *R, - const Expr *E) { +bool Compiler::visitZeroRecordInitializer(const Record *R, + const Expr *E) { assert(E); assert(R); // Fields @@ -3139,7 +3247,7 @@ bool ByteCodeExprGen::visitZeroRecordInitializer(const Record *R, template template -bool ByteCodeExprGen::emitConst(T Value, PrimType Ty, const Expr *E) { +bool Compiler::emitConst(T Value, PrimType Ty, const Expr *E) { switch (Ty) { case PT_Sint8: return this->emitConstSint8(Value, E); @@ -3173,13 +3281,13 @@ bool ByteCodeExprGen::emitConst(T Value, PrimType Ty, const Expr *E) { template template -bool ByteCodeExprGen::emitConst(T Value, const Expr *E) { +bool Compiler::emitConst(T Value, const Expr *E) { return this->emitConst(Value, classifyPrim(E->getType()), E); } template -bool ByteCodeExprGen::emitConst(const APSInt &Value, PrimType Ty, - const Expr *E) { +bool Compiler::emitConst(const APSInt &Value, PrimType Ty, + const Expr *E) { if (Ty == PT_IntAPS) return this->emitConstIntAPS(Value, E); if (Ty == PT_IntAP) @@ -3191,15 +3299,14 @@ bool ByteCodeExprGen::emitConst(const APSInt &Value, PrimType Ty, } template -bool ByteCodeExprGen::emitConst(const APSInt &Value, const Expr *E) { +bool Compiler::emitConst(const APSInt &Value, const Expr *E) { return this->emitConst(Value, classifyPrim(E->getType()), E); } template -unsigned ByteCodeExprGen::allocateLocalPrimitive(DeclTy &&Src, - PrimType Ty, - bool IsConst, - bool IsExtended) { +unsigned Compiler::allocateLocalPrimitive(DeclTy &&Src, PrimType Ty, + bool IsConst, + bool IsExtended) { // Make sure we don't accidentally register the same decl twice. if (const auto *VD = dyn_cast_if_present(Src.dyn_cast())) { @@ -3222,10 +3329,9 @@ unsigned ByteCodeExprGen::allocateLocalPrimitive(DeclTy &&Src, template std::optional -ByteCodeExprGen::allocateLocal(DeclTy &&Src, - const ValueDecl *ExtendingDecl) { +Compiler::allocateLocal(DeclTy &&Src, const ValueDecl *ExtendingDecl) { // Make sure we don't accidentally register the same decl twice. 
- if ([[maybe_unused]] const auto *VD = + if ([[maybe_unused]] const auto *VD = dyn_cast_if_present(Src.dyn_cast())) { assert(!P.getGlobal(VD)); assert(!Locals.contains(VD)); @@ -3264,31 +3370,29 @@ ByteCodeExprGen::allocateLocal(DeclTy &&Src, } template -const RecordType *ByteCodeExprGen::getRecordTy(QualType Ty) { +const RecordType *Compiler::getRecordTy(QualType Ty) { if (const PointerType *PT = dyn_cast(Ty)) return PT->getPointeeType()->getAs(); return Ty->getAs(); } -template -Record *ByteCodeExprGen::getRecord(QualType Ty) { +template Record *Compiler::getRecord(QualType Ty) { if (const auto *RecordTy = getRecordTy(Ty)) return getRecord(RecordTy->getDecl()); return nullptr; } template -Record *ByteCodeExprGen::getRecord(const RecordDecl *RD) { +Record *Compiler::getRecord(const RecordDecl *RD) { return P.getOrCreateRecord(RD); } template -const Function *ByteCodeExprGen::getFunction(const FunctionDecl *FD) { +const Function *Compiler::getFunction(const FunctionDecl *FD) { return Ctx.getOrCreateFunction(FD); } -template -bool ByteCodeExprGen::visitExpr(const Expr *E) { +template bool Compiler::visitExpr(const Expr *E) { ExprScope RootScope(this); // Void expressions. if (E->getType()->isVoidType()) { @@ -3330,8 +3434,7 @@ bool ByteCodeExprGen::visitExpr(const Expr *E) { /// We get here from evaluateAsInitializer(). /// We need to evaluate the initializer and return its value. template -bool ByteCodeExprGen::visitDecl(const VarDecl *VD, - bool ConstantContext) { +bool Compiler::visitDecl(const VarDecl *VD, bool ConstantContext) { assert(!VD->isInvalidDecl() && "Trying to constant evaluate an invalid decl"); std::optional VarT = classify(VD->getType()); @@ -3404,11 +3507,16 @@ bool ByteCodeExprGen::visitDecl(const VarDecl *VD, } template -bool ByteCodeExprGen::visitVarDecl(const VarDecl *VD) { +VarCreationState Compiler::visitVarDecl(const VarDecl *VD) { // We don't know what to do with these, so just return false. if (VD->getType().isNull()) return false; + // This case is EvalEmitter-only. If we won't create any instructions for the + // initializer anyway, don't bother creating the variable in the first place. 
+ if (!this->isActive()) + return VarCreationState::NotCreated(); + const Expr *Init = VD->getInit(); std::optional VarT = classify(VD->getType()); @@ -3467,8 +3575,8 @@ bool ByteCodeExprGen::visitVarDecl(const VarDecl *VD) { } template -bool ByteCodeExprGen::visitAPValue(const APValue &Val, - PrimType ValType, const Expr *E) { +bool Compiler::visitAPValue(const APValue &Val, PrimType ValType, + const Expr *E) { assert(!DiscardResult); if (Val.isInt()) return this->emitConst(Val.getInt(), ValType, E); @@ -3494,8 +3602,8 @@ bool ByteCodeExprGen::visitAPValue(const APValue &Val, } template -bool ByteCodeExprGen::visitAPValueInitializer(const APValue &Val, - const Expr *E) { +bool Compiler::visitAPValueInitializer(const APValue &Val, + const Expr *E) { if (Val.isStruct()) { const Record *R = this->getRecord(E->getType()); @@ -3557,7 +3665,7 @@ bool ByteCodeExprGen::visitAPValueInitializer(const APValue &Val, } template -bool ByteCodeExprGen::VisitBuiltinCallExpr(const CallExpr *E) { +bool Compiler::VisitBuiltinCallExpr(const CallExpr *E) { const Function *Func = getFunction(E->getDirectCallee()); if (!Func) return false; @@ -3567,9 +3675,16 @@ bool ByteCodeExprGen::VisitBuiltinCallExpr(const CallExpr *E) { unsigned Builtin = E->getBuiltinCallee(); if (Builtin == Builtin::BI__builtin___CFStringMakeConstantString || Builtin == Builtin::BI__builtin___NSStringMakeConstantString || + Builtin == Builtin::BI__builtin_ptrauth_sign_constant || Builtin == Builtin::BI__builtin_function_start) { - if (std::optional GlobalOffset = P.createGlobal(E)) - return this->emitGetPtrGlobal(*GlobalOffset, E); + if (std::optional GlobalOffset = P.createGlobal(E)) { + if (!this->emitGetPtrGlobal(*GlobalOffset, E)) + return false; + + if (PrimType PT = classifyPrim(E); PT != PT_Ptr && isPtrType(PT)) + return this->emitDecayPtr(PT_Ptr, PT, E); + return true; + } return false; } @@ -3605,7 +3720,7 @@ bool ByteCodeExprGen::VisitBuiltinCallExpr(const CallExpr *E) { } template -bool ByteCodeExprGen::VisitCallExpr(const CallExpr *E) { +bool Compiler::VisitCallExpr(const CallExpr *E) { if (E->getBuiltinCallee()) return VisitBuiltinCallExpr(E); @@ -3765,8 +3880,7 @@ bool ByteCodeExprGen::VisitCallExpr(const CallExpr *E) { } template -bool ByteCodeExprGen::VisitCXXDefaultInitExpr( - const CXXDefaultInitExpr *E) { +bool Compiler::VisitCXXDefaultInitExpr(const CXXDefaultInitExpr *E) { SourceLocScope SLS(this, E); bool Old = InitStackActive; @@ -3777,8 +3891,7 @@ bool ByteCodeExprGen::VisitCXXDefaultInitExpr( } template -bool ByteCodeExprGen::VisitCXXDefaultArgExpr( - const CXXDefaultArgExpr *E) { +bool Compiler::VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E) { SourceLocScope SLS(this, E); const Expr *SubExpr = E->getExpr(); @@ -3790,8 +3903,7 @@ bool ByteCodeExprGen::VisitCXXDefaultArgExpr( } template -bool ByteCodeExprGen::VisitCXXBoolLiteralExpr( - const CXXBoolLiteralExpr *E) { +bool Compiler::VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *E) { if (DiscardResult) return true; @@ -3799,7 +3911,7 @@ bool ByteCodeExprGen::VisitCXXBoolLiteralExpr( } template -bool ByteCodeExprGen::VisitCXXNullPtrLiteralExpr( +bool Compiler::VisitCXXNullPtrLiteralExpr( const CXXNullPtrLiteralExpr *E) { if (DiscardResult) return true; @@ -3808,7 +3920,7 @@ bool ByteCodeExprGen::VisitCXXNullPtrLiteralExpr( } template -bool ByteCodeExprGen::VisitGNUNullExpr(const GNUNullExpr *E) { +bool Compiler::VisitGNUNullExpr(const GNUNullExpr *E) { if (DiscardResult) return true; @@ -3819,7 +3931,7 @@ bool ByteCodeExprGen::VisitGNUNullExpr(const 
GNUNullExpr *E) { } template -bool ByteCodeExprGen::VisitCXXThisExpr(const CXXThisExpr *E) { +bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) { if (DiscardResult) return true; @@ -3843,8 +3955,640 @@ bool ByteCodeExprGen::VisitCXXThisExpr(const CXXThisExpr *E) { return this->emitThis(E); } +template bool Compiler::visitStmt(const Stmt *S) { + switch (S->getStmtClass()) { + case Stmt::CompoundStmtClass: + return visitCompoundStmt(cast(S)); + case Stmt::DeclStmtClass: + return visitDeclStmt(cast(S)); + case Stmt::ReturnStmtClass: + return visitReturnStmt(cast(S)); + case Stmt::IfStmtClass: + return visitIfStmt(cast(S)); + case Stmt::WhileStmtClass: + return visitWhileStmt(cast(S)); + case Stmt::DoStmtClass: + return visitDoStmt(cast(S)); + case Stmt::ForStmtClass: + return visitForStmt(cast(S)); + case Stmt::CXXForRangeStmtClass: + return visitCXXForRangeStmt(cast(S)); + case Stmt::BreakStmtClass: + return visitBreakStmt(cast(S)); + case Stmt::ContinueStmtClass: + return visitContinueStmt(cast(S)); + case Stmt::SwitchStmtClass: + return visitSwitchStmt(cast(S)); + case Stmt::CaseStmtClass: + return visitCaseStmt(cast(S)); + case Stmt::DefaultStmtClass: + return visitDefaultStmt(cast(S)); + case Stmt::AttributedStmtClass: + return visitAttributedStmt(cast(S)); + case Stmt::CXXTryStmtClass: + return visitCXXTryStmt(cast(S)); + case Stmt::NullStmtClass: + return true; + // Always invalid statements. + case Stmt::GCCAsmStmtClass: + case Stmt::MSAsmStmtClass: + case Stmt::GotoStmtClass: + return this->emitInvalid(S); + case Stmt::LabelStmtClass: + return this->visitStmt(cast(S)->getSubStmt()); + default: { + if (const auto *E = dyn_cast(S)) + return this->discard(E); + return false; + } + } +} + +/// Visits the given statment without creating a variable +/// scope for it in case it is a compound statement. +template bool Compiler::visitLoopBody(const Stmt *S) { + if (isa(S)) + return true; + + if (const auto *CS = dyn_cast(S)) { + for (auto *InnerStmt : CS->body()) + if (!visitStmt(InnerStmt)) + return false; + return true; + } + + return this->visitStmt(S); +} + +template +bool Compiler::visitCompoundStmt(const CompoundStmt *S) { + BlockScope Scope(this); + for (auto *InnerStmt : S->body()) + if (!visitStmt(InnerStmt)) + return false; + return true; +} + +template +bool Compiler::visitDeclStmt(const DeclStmt *DS) { + for (auto *D : DS->decls()) { + if (isa(D)) + continue; + + const auto *VD = dyn_cast(D); + if (!VD) + return false; + if (!this->visitVarDecl(VD)) + return false; + } + + return true; +} + +template +bool Compiler::visitReturnStmt(const ReturnStmt *RS) { + if (const Expr *RE = RS->getRetValue()) { + ExprScope RetScope(this); + if (ReturnType) { + // Primitive types are simply returned. + if (!this->visit(RE)) + return false; + this->emitCleanup(); + return this->emitRet(*ReturnType, RS); + } else if (RE->getType()->isVoidType()) { + if (!this->visit(RE)) + return false; + } else { + // RVO - construct the value in the return location. + if (!this->emitRVOPtr(RE)) + return false; + if (!this->visitInitializer(RE)) + return false; + if (!this->emitPopPtr(RE)) + return false; + + this->emitCleanup(); + return this->emitRetVoid(RS); + } + } + + // Void return. + this->emitCleanup(); + return this->emitRetVoid(RS); +} + +template bool Compiler::visitIfStmt(const IfStmt *IS) { + BlockScope IfScope(this); + + if (IS->isNonNegatedConsteval()) + return visitStmt(IS->getThen()); + if (IS->isNegatedConsteval()) + return IS->getElse() ? 
visitStmt(IS->getElse()) : true; + + if (auto *CondInit = IS->getInit()) + if (!visitStmt(CondInit)) + return false; + + if (const DeclStmt *CondDecl = IS->getConditionVariableDeclStmt()) + if (!visitDeclStmt(CondDecl)) + return false; + + if (!this->visitBool(IS->getCond())) + return false; + + if (const Stmt *Else = IS->getElse()) { + LabelTy LabelElse = this->getLabel(); + LabelTy LabelEnd = this->getLabel(); + if (!this->jumpFalse(LabelElse)) + return false; + if (!visitStmt(IS->getThen())) + return false; + if (!this->jump(LabelEnd)) + return false; + this->emitLabel(LabelElse); + if (!visitStmt(Else)) + return false; + this->emitLabel(LabelEnd); + } else { + LabelTy LabelEnd = this->getLabel(); + if (!this->jumpFalse(LabelEnd)) + return false; + if (!visitStmt(IS->getThen())) + return false; + this->emitLabel(LabelEnd); + } + + return true; +} + +template +bool Compiler::visitWhileStmt(const WhileStmt *S) { + const Expr *Cond = S->getCond(); + const Stmt *Body = S->getBody(); + + LabelTy CondLabel = this->getLabel(); // Label before the condition. + LabelTy EndLabel = this->getLabel(); // Label after the loop. + LoopScope LS(this, EndLabel, CondLabel); + + this->emitLabel(CondLabel); + + if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt()) + if (!visitDeclStmt(CondDecl)) + return false; + + if (!this->visitBool(Cond)) + return false; + if (!this->jumpFalse(EndLabel)) + return false; + + LocalScope Scope(this); + { + DestructorScope DS(Scope); + if (!this->visitLoopBody(Body)) + return false; + } + + if (!this->jump(CondLabel)) + return false; + this->emitLabel(EndLabel); + + return true; +} + +template bool Compiler::visitDoStmt(const DoStmt *S) { + const Expr *Cond = S->getCond(); + const Stmt *Body = S->getBody(); + + LabelTy StartLabel = this->getLabel(); + LabelTy EndLabel = this->getLabel(); + LabelTy CondLabel = this->getLabel(); + LoopScope LS(this, EndLabel, CondLabel); + LocalScope Scope(this); + + this->emitLabel(StartLabel); + { + DestructorScope DS(Scope); + + if (!this->visitLoopBody(Body)) + return false; + this->emitLabel(CondLabel); + if (!this->visitBool(Cond)) + return false; + } + if (!this->jumpTrue(StartLabel)) + return false; + + this->emitLabel(EndLabel); + return true; +} + +template +bool Compiler::visitForStmt(const ForStmt *S) { + // for (Init; Cond; Inc) { Body } + const Stmt *Init = S->getInit(); + const Expr *Cond = S->getCond(); + const Expr *Inc = S->getInc(); + const Stmt *Body = S->getBody(); + + LabelTy EndLabel = this->getLabel(); + LabelTy CondLabel = this->getLabel(); + LabelTy IncLabel = this->getLabel(); + LoopScope LS(this, EndLabel, IncLabel); + LocalScope Scope(this); + + if (Init && !this->visitStmt(Init)) + return false; + this->emitLabel(CondLabel); + + if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt()) + if (!visitDeclStmt(CondDecl)) + return false; + + if (Cond) { + if (!this->visitBool(Cond)) + return false; + if (!this->jumpFalse(EndLabel)) + return false; + } + + { + DestructorScope DS(Scope); + + if (Body && !this->visitLoopBody(Body)) + return false; + this->emitLabel(IncLabel); + if (Inc && !this->discard(Inc)) + return false; + } + + if (!this->jump(CondLabel)) + return false; + this->emitLabel(EndLabel); + return true; +} + +template +bool Compiler::visitCXXForRangeStmt(const CXXForRangeStmt *S) { + const Stmt *Init = S->getInit(); + const Expr *Cond = S->getCond(); + const Expr *Inc = S->getInc(); + const Stmt *Body = S->getBody(); + const Stmt *BeginStmt = S->getBeginStmt(); + const Stmt *RangeStmt 
= S->getRangeStmt(); + const Stmt *EndStmt = S->getEndStmt(); + const VarDecl *LoopVar = S->getLoopVariable(); + + LabelTy EndLabel = this->getLabel(); + LabelTy CondLabel = this->getLabel(); + LabelTy IncLabel = this->getLabel(); + LoopScope LS(this, EndLabel, IncLabel); + + // Emit declarations needed in the loop. + if (Init && !this->visitStmt(Init)) + return false; + if (!this->visitStmt(RangeStmt)) + return false; + if (!this->visitStmt(BeginStmt)) + return false; + if (!this->visitStmt(EndStmt)) + return false; + + // Now the condition as well as the loop variable assignment. + this->emitLabel(CondLabel); + if (!this->visitBool(Cond)) + return false; + if (!this->jumpFalse(EndLabel)) + return false; + + if (!this->visitVarDecl(LoopVar)) + return false; + + // Body. + LocalScope Scope(this); + { + DestructorScope DS(Scope); + + if (!this->visitLoopBody(Body)) + return false; + this->emitLabel(IncLabel); + if (!this->discard(Inc)) + return false; + } + if (!this->jump(CondLabel)) + return false; + + this->emitLabel(EndLabel); + return true; +} + +template +bool Compiler::visitBreakStmt(const BreakStmt *S) { + if (!BreakLabel) + return false; + + this->VarScope->emitDestructors(); + return this->jump(*BreakLabel); +} + +template +bool Compiler::visitContinueStmt(const ContinueStmt *S) { + if (!ContinueLabel) + return false; + + this->VarScope->emitDestructors(); + return this->jump(*ContinueLabel); +} + +template +bool Compiler::visitSwitchStmt(const SwitchStmt *S) { + const Expr *Cond = S->getCond(); + PrimType CondT = this->classifyPrim(Cond->getType()); + + LabelTy EndLabel = this->getLabel(); + OptLabelTy DefaultLabel = std::nullopt; + unsigned CondVar = this->allocateLocalPrimitive(Cond, CondT, true, false); + + if (const auto *CondInit = S->getInit()) + if (!visitStmt(CondInit)) + return false; + + if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt()) + if (!visitDeclStmt(CondDecl)) + return false; + + // Initialize condition variable. + if (!this->visit(Cond)) + return false; + if (!this->emitSetLocal(CondT, CondVar, S)) + return false; + + CaseMap CaseLabels; + // Create labels and comparison ops for all case statements. + for (const SwitchCase *SC = S->getSwitchCaseList(); SC; + SC = SC->getNextSwitchCase()) { + if (const auto *CS = dyn_cast(SC)) { + // FIXME: Implement ranges. + if (CS->caseStmtIsGNURange()) + return false; + CaseLabels[SC] = this->getLabel(); + + const Expr *Value = CS->getLHS(); + PrimType ValueT = this->classifyPrim(Value->getType()); + + // Compare the case statement's value to the switch condition. + if (!this->emitGetLocal(CondT, CondVar, CS)) + return false; + if (!this->visit(Value)) + return false; + + // Compare and jump to the case label. + if (!this->emitEQ(ValueT, S)) + return false; + if (!this->jumpTrue(CaseLabels[CS])) + return false; + } else { + assert(!DefaultLabel); + DefaultLabel = this->getLabel(); + } + } + + // If none of the conditions above were true, fall through to the default + // statement or jump after the switch statement. 
+ if (DefaultLabel) { + if (!this->jump(*DefaultLabel)) + return false; + } else { + if (!this->jump(EndLabel)) + return false; + } + + SwitchScope SS(this, std::move(CaseLabels), EndLabel, DefaultLabel); + if (!this->visitStmt(S->getBody())) + return false; + this->emitLabel(EndLabel); + return true; +} + template -bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { +bool Compiler::visitCaseStmt(const CaseStmt *S) { + this->emitLabel(CaseLabels[S]); + return this->visitStmt(S->getSubStmt()); +} + +template +bool Compiler::visitDefaultStmt(const DefaultStmt *S) { + this->emitLabel(*DefaultLabel); + return this->visitStmt(S->getSubStmt()); +} + +template +bool Compiler::visitAttributedStmt(const AttributedStmt *S) { + if (this->Ctx.getLangOpts().CXXAssumptions && + !this->Ctx.getLangOpts().MSVCCompat) { + for (const Attr *A : S->getAttrs()) { + auto *AA = dyn_cast(A); + if (!AA) + continue; + + assert(isa(S->getSubStmt())); + + const Expr *Assumption = AA->getAssumption(); + if (Assumption->isValueDependent()) + return false; + + if (Assumption->HasSideEffects(this->Ctx.getASTContext())) + continue; + + // Evaluate assumption. + if (!this->visitBool(Assumption)) + return false; + + if (!this->emitAssume(Assumption)) + return false; + } + } + + // Ignore other attributes. + return this->visitStmt(S->getSubStmt()); +} + +template +bool Compiler::visitCXXTryStmt(const CXXTryStmt *S) { + // Ignore all handlers. + return this->visitStmt(S->getTryBlock()); +} + +template +bool Compiler::emitLambdaStaticInvokerBody(const CXXMethodDecl *MD) { + assert(MD->isLambdaStaticInvoker()); + assert(MD->hasBody()); + assert(cast(MD->getBody())->body_empty()); + + const CXXRecordDecl *ClosureClass = MD->getParent(); + const CXXMethodDecl *LambdaCallOp = ClosureClass->getLambdaCallOperator(); + assert(ClosureClass->captures_begin() == ClosureClass->captures_end()); + const Function *Func = this->getFunction(LambdaCallOp); + if (!Func) + return false; + assert(Func->hasThisPointer()); + assert(Func->getNumParams() == (MD->getNumParams() + 1 + Func->hasRVO())); + + if (Func->hasRVO()) { + if (!this->emitRVOPtr(MD)) + return false; + } + + // The lambda call operator needs an instance pointer, but we don't have + // one here, and we don't need one either because the lambda cannot have + // any captures, as verified above. Emit a null pointer. This is then + // special-cased when interpreting to not emit any misleading diagnostics. + if (!this->emitNullPtr(nullptr, MD)) + return false; + + // Forward all arguments from the static invoker to the lambda call operator. + for (const ParmVarDecl *PVD : MD->parameters()) { + auto It = this->Params.find(PVD); + assert(It != this->Params.end()); + + // We do the lvalue-to-rvalue conversion manually here, so no need + // to care about references. + PrimType ParamType = this->classify(PVD->getType()).value_or(PT_Ptr); + if (!this->emitGetParam(ParamType, It->second.Offset, MD)) + return false; + } + + if (!this->emitCall(Func, 0, LambdaCallOp)) + return false; + + this->emitCleanup(); + if (ReturnType) + return this->emitRet(*ReturnType, MD); + + // Nothing to do, since we emitted the RVO pointer above. + return this->emitRetVoid(MD); +} + +template +bool Compiler::visitFunc(const FunctionDecl *F) { + // Classify the return type. + ReturnType = this->classify(F->getReturnType()); + + auto emitFieldInitializer = [&](const Record::Field *F, unsigned FieldOffset, + const Expr *InitExpr) -> bool { + // We don't know what to do with these, so just return false. 
+ if (InitExpr->getType().isNull()) + return false; + + if (std::optional T = this->classify(InitExpr)) { + if (!this->visit(InitExpr)) + return false; + + if (F->isBitField()) + return this->emitInitThisBitField(*T, F, FieldOffset, InitExpr); + return this->emitInitThisField(*T, FieldOffset, InitExpr); + } + // Non-primitive case. Get a pointer to the field-to-initialize + // on the stack and call visitInitialzer() for it. + InitLinkScope FieldScope(this, InitLink::Field(F->Offset)); + if (!this->emitGetPtrThisField(FieldOffset, InitExpr)) + return false; + + if (!this->visitInitializer(InitExpr)) + return false; + + return this->emitPopPtr(InitExpr); + }; + + // Emit custom code if this is a lambda static invoker. + if (const auto *MD = dyn_cast(F); + MD && MD->isLambdaStaticInvoker()) + return this->emitLambdaStaticInvokerBody(MD); + + // Constructor. Set up field initializers. + if (const auto *Ctor = dyn_cast(F)) { + const RecordDecl *RD = Ctor->getParent(); + const Record *R = this->getRecord(RD); + if (!R) + return false; + + InitLinkScope InitScope(this, InitLink::This()); + for (const auto *Init : Ctor->inits()) { + // Scope needed for the initializers. + BlockScope Scope(this); + + const Expr *InitExpr = Init->getInit(); + if (const FieldDecl *Member = Init->getMember()) { + const Record::Field *F = R->getField(Member); + + if (!emitFieldInitializer(F, F->Offset, InitExpr)) + return false; + } else if (const Type *Base = Init->getBaseClass()) { + const auto *BaseDecl = Base->getAsCXXRecordDecl(); + assert(BaseDecl); + + if (Init->isBaseVirtual()) { + assert(R->getVirtualBase(BaseDecl)); + if (!this->emitGetPtrThisVirtBase(BaseDecl, InitExpr)) + return false; + + } else { + // Base class initializer. + // Get This Base and call initializer on it. + const Record::Base *B = R->getBase(BaseDecl); + assert(B); + if (!this->emitGetPtrThisBase(B->Offset, InitExpr)) + return false; + } + + if (!this->visitInitializer(InitExpr)) + return false; + if (!this->emitFinishInitPop(InitExpr)) + return false; + } else if (const IndirectFieldDecl *IFD = Init->getIndirectMember()) { + assert(IFD->getChainingSize() >= 2); + + unsigned NestedFieldOffset = 0; + const Record::Field *NestedField = nullptr; + for (const NamedDecl *ND : IFD->chain()) { + const auto *FD = cast(ND); + const Record *FieldRecord = + this->P.getOrCreateRecord(FD->getParent()); + assert(FieldRecord); + + NestedField = FieldRecord->getField(FD); + assert(NestedField); + + NestedFieldOffset += NestedField->Offset; + } + assert(NestedField); + + if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr)) + return false; + } else { + assert(Init->isDelegatingInitializer()); + if (!this->emitThis(InitExpr)) + return false; + if (!this->visitInitializer(Init->getInit())) + return false; + if (!this->emitPopPtr(InitExpr)) + return false; + } + } + } + + if (const auto *Body = F->getBody()) + if (!visitStmt(Body)) + return false; + + // Emit a guard return to protect against a code path missing one. + if (F->getReturnType()->isVoidType()) + return this->emitRetVoid(SourceInfo{}); + return this->emitNoRet(SourceInfo{}); +} + +template +bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { const Expr *SubExpr = E->getSubExpr(); if (SubExpr->getType()->isAnyComplexType()) return this->VisitComplexUnaryOperator(E); @@ -4004,7 +4748,7 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; return DiscardResult ? 
this->emitPop(*T, E) : this->emitNeg(*T, E); - case UO_Plus: // +x + case UO_Plus: // +x if (!this->visit(SubExpr)) // noop return false; return DiscardResult ? this->emitPop(*T, E) : true; @@ -4016,11 +4760,11 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { } // We should already have a pointer when we get here. return this->delegate(SubExpr); - case UO_Deref: // *x + case UO_Deref: // *x if (DiscardResult) return this->discard(SubExpr); return this->visit(SubExpr); - case UO_Not: // ~x + case UO_Not: // ~x if (!this->visit(SubExpr)) return false; return DiscardResult ? this->emitPop(*T, E) : this->emitComp(*T, E); @@ -4043,8 +4787,7 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { } template -bool ByteCodeExprGen::VisitComplexUnaryOperator( - const UnaryOperator *E) { +bool Compiler::VisitComplexUnaryOperator(const UnaryOperator *E) { const Expr *SubExpr = E->getSubExpr(); assert(SubExpr->getType()->isAnyComplexType()); @@ -4151,7 +4894,7 @@ bool ByteCodeExprGen::VisitComplexUnaryOperator( } template -bool ByteCodeExprGen::visitDeclRef(const ValueDecl *D, const Expr *E) { +bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (DiscardResult) return true; @@ -4189,8 +4932,11 @@ bool ByteCodeExprGen::visitDeclRef(const ValueDecl *D, const Expr *E) { return this->emitGetLocal(PT_Ptr, Offset, E); return this->emitGetPtrLocal(Offset, E); } else if (auto GlobalIndex = P.getGlobal(D)) { - if (IsReference) - return this->emitGetGlobal(classifyPrim(E), *GlobalIndex, E); + if (IsReference) { + if (!Ctx.getLangOpts().CPlusPlus11) + return this->emitGetGlobal(classifyPrim(E), *GlobalIndex, E); + return this->emitGetGlobalUnchecked(classifyPrim(E), *GlobalIndex, E); + } return this->emitGetPtrGlobal(*GlobalIndex, E); } else if (const auto *PVD = dyn_cast(D)) { @@ -4237,7 +4983,10 @@ bool ByteCodeExprGen::visitDeclRef(const ValueDecl *D, const Expr *E) { if ((VD->hasGlobalStorage() || VD->isLocalVarDecl() || VD->isStaticDataMember()) && typeShouldBeVisited(VD->getType())) { - if (!this->visitVarDecl(VD)) + auto VarState = this->visitVarDecl(VD); + if (VarState.notCreated()) + return true; + if (!VarState) return false; // Retry. return this->visitDeclRef(VD, E); @@ -4247,7 +4996,10 @@ bool ByteCodeExprGen::visitDeclRef(const ValueDecl *D, const Expr *E) { if (const auto *VD = dyn_cast(D); VD && VD->getAnyInitializer() && VD->getType().isConstant(Ctx.getASTContext()) && !VD->isWeak()) { - if (!this->visitVarDecl(VD)) + auto VarState = this->visitVarDecl(VD); + if (VarState.notCreated()) + return true; + if (!VarState) return false; // Retry. 
return this->visitDeclRef(VD, E); @@ -4274,21 +5026,19 @@ bool ByteCodeExprGen::visitDeclRef(const ValueDecl *D, const Expr *E) { } template -bool ByteCodeExprGen::VisitDeclRefExpr(const DeclRefExpr *E) { +bool Compiler::VisitDeclRefExpr(const DeclRefExpr *E) { const auto *D = E->getDecl(); return this->visitDeclRef(D, E); } -template -void ByteCodeExprGen::emitCleanup() { +template void Compiler::emitCleanup() { for (VariableScope *C = VarScope; C; C = C->getParent()) C->emitDestruction(); } template -unsigned -ByteCodeExprGen::collectBaseOffset(const QualType BaseType, - const QualType DerivedType) { +unsigned Compiler::collectBaseOffset(const QualType BaseType, + const QualType DerivedType) { const auto extractRecordDecl = [](QualType Ty) -> const CXXRecordDecl * { if (const auto *PT = dyn_cast(Ty)) return PT->getPointeeType()->getAsCXXRecordDecl(); @@ -4302,8 +5052,8 @@ ByteCodeExprGen::collectBaseOffset(const QualType BaseType, /// Emit casts from a PrimType to another PrimType. template -bool ByteCodeExprGen::emitPrimCast(PrimType FromT, PrimType ToT, - QualType ToQT, const Expr *E) { +bool Compiler::emitPrimCast(PrimType FromT, PrimType ToT, + QualType ToQT, const Expr *E) { if (FromT == PT_Float) { // Floating to floating. @@ -4345,7 +5095,7 @@ bool ByteCodeExprGen::emitPrimCast(PrimType FromT, PrimType ToT, /// Emits __real(SubExpr) template -bool ByteCodeExprGen::emitComplexReal(const Expr *SubExpr) { +bool Compiler::emitComplexReal(const Expr *SubExpr) { assert(SubExpr->getType()->isAnyComplexType()); if (DiscardResult) @@ -4365,7 +5115,7 @@ bool ByteCodeExprGen::emitComplexReal(const Expr *SubExpr) { } template -bool ByteCodeExprGen::emitComplexBoolCast(const Expr *E) { +bool Compiler::emitComplexBoolCast(const Expr *E) { assert(!DiscardResult); PrimType ElemT = classifyComplexElementType(E->getType()); // We emit the expression (__real(E) != 0 || __imag(E) != 0) @@ -4411,9 +5161,8 @@ bool ByteCodeExprGen::emitComplexBoolCast(const Expr *E) { } template -bool ByteCodeExprGen::emitComplexComparison(const Expr *LHS, - const Expr *RHS, - const BinaryOperator *E) { +bool Compiler::emitComplexComparison(const Expr *LHS, const Expr *RHS, + const BinaryOperator *E) { assert(E->isComparisonOp()); assert(!Initializing); assert(!DiscardResult); @@ -4510,7 +5259,7 @@ bool ByteCodeExprGen::emitComplexComparison(const Expr *LHS, /// on the stack. /// Emit destruction of record types (or arrays of record types). template -bool ByteCodeExprGen::emitRecordDestruction(const Record *R) { +bool Compiler::emitRecordDestruction(const Record *R) { assert(R); // First, destroy all fields. for (const Record::Field &Field : llvm::reverse(R->fields())) { @@ -4558,7 +5307,7 @@ bool ByteCodeExprGen::emitRecordDestruction(const Record *R) { /// on the stack. /// Emit destruction of record types (or arrays of record types). 
template -bool ByteCodeExprGen::emitDestruction(const Descriptor *Desc) { +bool Compiler::emitDestruction(const Descriptor *Desc) { assert(Desc); assert(!Desc->isPrimitive()); assert(!Desc->isPrimitiveArray()); @@ -4601,8 +5350,8 @@ bool ByteCodeExprGen::emitDestruction(const Descriptor *Desc) { namespace clang { namespace interp { -template class ByteCodeExprGen; -template class ByteCodeExprGen; +template class Compiler; +template class Compiler; } // namespace interp } // namespace clang diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/Compiler.h similarity index 84% rename from clang/lib/AST/Interp/ByteCodeExprGen.h rename to clang/lib/AST/Interp/Compiler.h index eef8cae6e38cd8..d188ce2200664f 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/Compiler.h @@ -1,4 +1,4 @@ -//===--- ByteCodeExprGen.h - Code generator for expressions -----*- C++ -*-===// +//===--- Compiler.h - Code generator for expressions -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -36,8 +36,11 @@ template class InitLinkScope; template class OptionScope; template class ArrayIndexScope; template class SourceLocScope; +template class LoopScope; +template class LabelScope; +template class SwitchScope; -template class ByteCodeExprGen; +template class Compiler; struct InitLink { public: enum { @@ -60,7 +63,7 @@ struct InitLink { InitLink(uint8_t Kind) : Kind(Kind) {} template - bool emit(ByteCodeExprGen *Ctx, const Expr *E) const; + bool emit(Compiler *Ctx, const Expr *E) const; private: uint32_t Kind; @@ -70,14 +73,28 @@ struct InitLink { }; }; +/// State encapsulating if a the variable creation has been successful, +/// unsuccessful, or no variable has been created at all. +struct VarCreationState { + std::optional S = std::nullopt; + VarCreationState() = default; + VarCreationState(bool b) : S(b) {} + static VarCreationState NotCreated() { return VarCreationState(); } + + operator bool() const { return S && *S; } + bool notCreated() const { return !S; } +}; + /// Compilation context for expressions. template -class ByteCodeExprGen : public ConstStmtVisitor, bool>, - public Emitter { +class Compiler : public ConstStmtVisitor, bool>, + public Emitter { protected: // Aliases for types defined in the emitter. using LabelTy = typename Emitter::LabelTy; using AddrTy = typename Emitter::AddrTy; + using OptLabelTy = std::optional; + using CaseMap = llvm::DenseMap; /// Current compilation context. Context &Ctx; @@ -87,10 +104,10 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, public: /// Initializes the compiler and the backend emitter. template - ByteCodeExprGen(Context &Ctx, Program &P, Tys &&... Args) + Compiler(Context &Ctx, Program &P, Tys &&...Args) : Emitter(Ctx, P, Args...), Ctx(Ctx), P(P) {} - // Expression visitors - result returned on interp stack. + // Expressions. bool VisitCastExpr(const CastExpr *E); bool VisitIntegerLiteral(const IntegerLiteral *E); bool VisitFloatingLiteral(const FloatingLiteral *E); @@ -165,10 +182,32 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, bool VisitShuffleVectorExpr(const ShuffleVectorExpr *E); bool VisitExtVectorElementExpr(const ExtVectorElementExpr *E); bool VisitObjCBoxedExpr(const ObjCBoxedExpr *E); + bool VisitCXXStdInitializerListExpr(const CXXStdInitializerListExpr *E); + bool VisitStmtExpr(const StmtExpr *E); + + // Statements. 
+ bool visitCompoundStmt(const CompoundStmt *S); + bool visitLoopBody(const Stmt *S); + bool visitDeclStmt(const DeclStmt *DS); + bool visitReturnStmt(const ReturnStmt *RS); + bool visitIfStmt(const IfStmt *IS); + bool visitWhileStmt(const WhileStmt *S); + bool visitDoStmt(const DoStmt *S); + bool visitForStmt(const ForStmt *S); + bool visitCXXForRangeStmt(const CXXForRangeStmt *S); + bool visitBreakStmt(const BreakStmt *S); + bool visitContinueStmt(const ContinueStmt *S); + bool visitSwitchStmt(const SwitchStmt *S); + bool visitCaseStmt(const CaseStmt *S); + bool visitDefaultStmt(const DefaultStmt *S); + bool visitAttributedStmt(const AttributedStmt *S); + bool visitCXXTryStmt(const CXXTryStmt *S); protected: + bool visitStmt(const Stmt *S); bool visitExpr(const Expr *E) override; bool visitDecl(const VarDecl *VD, bool ConstantContext) override; + bool visitFunc(const FunctionDecl *F) override; protected: /// Emits scope cleanup instructions. @@ -181,8 +220,8 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, Record *getRecord(QualType Ty); Record *getRecord(const RecordDecl *RD); - // Returns a function for the given FunctionDecl. - // If the function does not exist yet, it is compiled. + /// Returns a function for the given FunctionDecl. + /// If the function does not exist yet, it is compiled. const Function *getFunction(const FunctionDecl *FD); std::optional classify(const Expr *E) const { @@ -220,9 +259,8 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, /// Just pass evaluation on to \p E. This leaves all the parsing flags /// intact. bool delegate(const Expr *E); - /// Creates and initializes a variable from the given decl. - bool visitVarDecl(const VarDecl *VD); + VarCreationState visitVarDecl(const VarDecl *VD); /// Visit an APValue. bool visitAPValue(const APValue &Val, PrimType ValType, const Expr *E); bool visitAPValueInitializer(const APValue &Val, const Expr *E); @@ -293,6 +331,9 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, friend class ArrayIndexScope; friend class SourceLocScope; friend struct InitLink; + friend class LoopScope; + friend class LabelScope; + friend class SwitchScope; /// Emits a zero initializer. bool visitZeroInitializer(PrimType T, QualType QT, const Expr *E); @@ -336,6 +377,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, bool emitDestruction(const Descriptor *Desc); unsigned collectBaseOffset(const QualType BaseType, const QualType DerivedType); + bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD); protected: /// Variable to storage mapping. @@ -366,15 +408,28 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, /// Flag indicating if we're initializing a global variable. bool GlobalDecl = false; + + /// Type of the expression returned by the function. + std::optional ReturnType; + + /// Switch case mapping. + CaseMap CaseLabels; + + /// Point to break to. + OptLabelTy BreakLabel; + /// Point to continue to. + OptLabelTy ContinueLabel; + /// Default case label. + OptLabelTy DefaultLabel; }; -extern template class ByteCodeExprGen; -extern template class ByteCodeExprGen; +extern template class Compiler; +extern template class Compiler; /// Scope chain managing the variable lifetimes. 
template class VariableScope { public: - VariableScope(ByteCodeExprGen *Ctx, const ValueDecl *VD) + VariableScope(Compiler *Ctx, const ValueDecl *VD) : Ctx(Ctx), Parent(Ctx->VarScope), ValDecl(VD) { Ctx->VarScope = this; } @@ -421,8 +476,8 @@ template class VariableScope { VariableScope *getParent() const { return Parent; } protected: - /// ByteCodeExprGen instance. - ByteCodeExprGen *Ctx; + /// Compiler instance. + Compiler *Ctx; /// Link to the parent scope. VariableScope *Parent; const ValueDecl *ValDecl = nullptr; @@ -431,8 +486,7 @@ template class VariableScope { /// Generic scope for local variables. template class LocalScope : public VariableScope { public: - LocalScope(ByteCodeExprGen *Ctx) - : VariableScope(Ctx, nullptr) {} + LocalScope(Compiler *Ctx) : VariableScope(Ctx, nullptr) {} /// Emit a Destroy op for this scope. ~LocalScope() override { @@ -526,8 +580,7 @@ template class DestructorScope final { /// variables are automatically emitted when the AutoScope is destroyed. template class AutoScope : public LocalScope { public: - AutoScope(ByteCodeExprGen *Ctx) - : LocalScope(Ctx), DS(*this) {} + AutoScope(Compiler *Ctx) : LocalScope(Ctx), DS(*this) {} private: DestructorScope DS; @@ -536,7 +589,7 @@ template class AutoScope : public LocalScope { /// Scope for storage declared in a compound statement. template class BlockScope final : public AutoScope { public: - BlockScope(ByteCodeExprGen *Ctx) : AutoScope(Ctx) {} + BlockScope(Compiler *Ctx) : AutoScope(Ctx) {} void addExtended(const Scope::Local &Local) override { // If we to this point, just add the variable as a normal local @@ -548,12 +601,12 @@ template class BlockScope final : public AutoScope { template class ExprScope final : public AutoScope { public: - ExprScope(ByteCodeExprGen *Ctx) : AutoScope(Ctx) {} + ExprScope(Compiler *Ctx) : AutoScope(Ctx) {} }; template class ArrayIndexScope final { public: - ArrayIndexScope(ByteCodeExprGen *Ctx, uint64_t Index) : Ctx(Ctx) { + ArrayIndexScope(Compiler *Ctx, uint64_t Index) : Ctx(Ctx) { OldArrayIndex = Ctx->ArrayIndex; Ctx->ArrayIndex = Index; } @@ -561,14 +614,13 @@ template class ArrayIndexScope final { ~ArrayIndexScope() { Ctx->ArrayIndex = OldArrayIndex; } private: - ByteCodeExprGen *Ctx; + Compiler *Ctx; std::optional OldArrayIndex; }; template class SourceLocScope final { public: - SourceLocScope(ByteCodeExprGen *Ctx, const Expr *DefaultExpr) - : Ctx(Ctx) { + SourceLocScope(Compiler *Ctx, const Expr *DefaultExpr) : Ctx(Ctx) { assert(DefaultExpr); // We only switch if the current SourceLocDefaultExpr is null. 
if (!Ctx->SourceLocDefaultExpr) { @@ -583,20 +635,20 @@ template class SourceLocScope final { } private: - ByteCodeExprGen *Ctx; + Compiler *Ctx; bool Enabled = false; }; template class InitLinkScope final { public: - InitLinkScope(ByteCodeExprGen *Ctx, InitLink &&Link) : Ctx(Ctx) { + InitLinkScope(Compiler *Ctx, InitLink &&Link) : Ctx(Ctx) { Ctx->InitStack.push_back(std::move(Link)); } ~InitLinkScope() { this->Ctx->InitStack.pop_back(); } private: - ByteCodeExprGen *Ctx; + Compiler *Ctx; }; } // namespace interp diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp index 98d1837204ebc4..913e8d514282ad 100644 --- a/clang/lib/AST/Interp/Context.cpp +++ b/clang/lib/AST/Interp/Context.cpp @@ -8,8 +8,7 @@ #include "Context.h" #include "ByteCodeEmitter.h" -#include "ByteCodeExprGen.h" -#include "ByteCodeStmtGen.h" +#include "Compiler.h" #include "EvalEmitter.h" #include "Interp.h" #include "InterpFrame.h" @@ -30,7 +29,7 @@ bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { assert(Stk.empty()); Function *Func = P->getFunction(FD); if (!Func || !Func->hasBody()) - Func = ByteCodeStmtGen(*this, *P).compileFunc(FD); + Func = Compiler(*this, *P).compileFunc(FD); APValue DummyResult; if (!Run(Parent, Func, DummyResult)) @@ -40,8 +39,9 @@ bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { } bool Context::evaluateAsRValue(State &Parent, const Expr *E, APValue &Result) { + ++EvalID; bool Recursing = !Stk.empty(); - ByteCodeExprGen C(*this, *P, Parent, Stk); + Compiler C(*this, *P, Parent, Stk); auto Res = C.interpretExpr(E, /*ConvertResultToRValue=*/E->isGLValue()); @@ -66,8 +66,9 @@ bool Context::evaluateAsRValue(State &Parent, const Expr *E, APValue &Result) { } bool Context::evaluate(State &Parent, const Expr *E, APValue &Result) { + ++EvalID; bool Recursing = !Stk.empty(); - ByteCodeExprGen C(*this, *P, Parent, Stk); + Compiler C(*this, *P, Parent, Stk); auto Res = C.interpretExpr(E); if (Res.isInvalid()) { @@ -91,8 +92,9 @@ bool Context::evaluate(State &Parent, const Expr *E, APValue &Result) { bool Context::evaluateAsInitializer(State &Parent, const VarDecl *VD, APValue &Result) { + ++EvalID; bool Recursing = !Stk.empty(); - ByteCodeExprGen C(*this, *P, Parent, Stk); + Compiler C(*this, *P, Parent, Stk); bool CheckGlobalInitialized = shouldBeGloballyIndexed(VD) && @@ -261,7 +263,7 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FD) { return Func; if (!Func || WasNotDefined) { - if (auto F = ByteCodeStmtGen(*this, *P).compileFunc(FD)) + if (auto F = Compiler(*this, *P).compileFunc(FD)) Func = F; } diff --git a/clang/lib/AST/Interp/Context.h b/clang/lib/AST/Interp/Context.h index c78dc9a2a471eb..b8ea4ad6b3b447 100644 --- a/clang/lib/AST/Interp/Context.h +++ b/clang/lib/AST/Interp/Context.h @@ -109,6 +109,8 @@ class Context final { const Record *getRecord(const RecordDecl *D) const; + unsigned getEvalID() const { return EvalID; } + private: /// Runs a function. bool Run(State &Parent, const Function *Func, APValue &Result); @@ -119,6 +121,8 @@ class Context final { InterpStack Stk; /// Constexpr program. std::unique_ptr P; + /// ID identifying an evaluation. 
+ unsigned EvalID = 0; }; } // namespace interp diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp index fcb778f7aeab0e..afafae088aca0c 100644 --- a/clang/lib/AST/Interp/Descriptor.cpp +++ b/clang/lib/AST/Interp/Descriptor.cpp @@ -64,7 +64,14 @@ static void dtorArrayTy(Block *, std::byte *Ptr, const Descriptor *D) { template static void moveArrayTy(Block *, const std::byte *Src, std::byte *Dst, const Descriptor *D) { - // FIXME: Need to copy the InitMap? + // FIXME: Get rid of the const_cast. + InitMapPtr &SrcIMP = + *reinterpret_cast(const_cast(Src)); + if (SrcIMP) { + // We only ever invoke the moveFunc when moving block contents to a + // DeadBlock. DeadBlocks don't need InitMaps, so we destroy them here. + SrcIMP = std::nullopt; + } Src += sizeof(InitMapPtr); Dst += sizeof(InitMapPtr); for (unsigned I = 0, NE = D->getNumElems(); I < NE; ++I) { @@ -348,11 +355,11 @@ Descriptor::Descriptor(const DeclTy &D) } QualType Descriptor::getType() const { - if (auto *E = asExpr()) + if (const auto *E = asExpr()) return E->getType(); - if (auto *D = asValueDecl()) + if (const auto *D = asValueDecl()) return D->getType(); - if (auto *T = dyn_cast(asDecl())) + if (const auto *T = dyn_cast(asDecl())) return QualType(T->getTypeForDecl(), 0); llvm_unreachable("Invalid descriptor type"); } diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 0ab84d159c58b7..d946a10c22dc3f 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -25,6 +25,7 @@ #include "Program.h" #include "clang/AST/ASTDumperUtils.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/ExprCXX.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Format.h" @@ -154,6 +155,32 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { OS << (GP.isInitialized() ? "initialized " : "uninitialized "); } Desc->dump(OS); + + if (GP.isInitialized() && Desc->IsTemporary) { + if (const auto *MTE = + dyn_cast_if_present(Desc->asExpr()); + MTE && MTE->getLifetimeExtendedTemporaryDecl()) { + if (const APValue *V = + MTE->getLifetimeExtendedTemporaryDecl()->getValue()) { + OS << " (global temporary value: "; + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_MAGENTA, true}); + std::string VStr; + llvm::raw_string_ostream SS(VStr); + V->dump(SS, Ctx.getASTContext()); + + for (unsigned I = 0; I != VStr.size(); ++I) { + if (VStr[I] == '\n') + VStr[I] = ' '; + } + VStr.pop_back(); // Remove the newline (or now space) at the end. + OS << VStr; + } + OS << ')'; + } + } + } + OS << "\n"; if (GP.isInitialized() && Desc->isPrimitive() && !Desc->isDummy()) { OS << " "; @@ -191,7 +218,7 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const { if (const auto *ND = dyn_cast_if_present(asDecl())) ND->printQualifiedName(OS); else if (asExpr()) - OS << "expr (TODO)"; + OS << "Expr " << (const void *)asExpr(); } // Print a few interesting bits about the descriptor. 
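For illustration only (not part of the patch): a lifetime-extended global temporary of the kind the dump above annotates with "(global temporary value: ...)". The names are made up.

struct Pair { int a, b; };
// Binding the reference extends the temporary to static storage duration, so
// the interpreter materializes it as a global block and keeps a cached APValue
// that codegen later reads back.
constexpr const Pair &P = Pair{1, 2};
static_assert(P.a == 1, "");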
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index 77ff901634a462..d17151416b44bd 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -56,6 +56,7 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E, EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD, bool CheckFullyInitialized) { this->CheckFullyInitialized = CheckFullyInitialized; + S.EvaluatingDecl = VD; if (const Expr *Init = VD->getAnyInitializer()) { QualType T = VD->getType(); @@ -69,6 +70,8 @@ EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD, if (!this->visitDecl(VD, S.inConstantContext()) && EvalResult.empty()) EvalResult.setInvalid(); + S.EvaluatingDecl = nullptr; + updateGlobalTemporaries(); return std::move(this->EvalResult); } @@ -81,7 +84,7 @@ EvalEmitter::LabelTy EvalEmitter::getLabel() { return NextLabel++; } Scope::Local EvalEmitter::createLocal(Descriptor *D) { // Allocate memory for a local. auto Memory = std::make_unique(sizeof(Block) + D->getAllocSize()); - auto *B = new (Memory.get()) Block(D, /*isStatic=*/false); + auto *B = new (Memory.get()) Block(Ctx.getEvalID(), D, /*isStatic=*/false); B->invokeCtor(); // Initialize local variable inline descriptor. @@ -148,6 +151,11 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { // Implicitly convert lvalue to rvalue, if requested. if (ConvertResultToRValue) { + // Never allow reading from a non-const pointer, unless the memory + // has been created in this evaluation. + if (!Ptr.isConst() && Ptr.block()->getEvalID() != Ctx.getEvalID()) + return false; + if (std::optional V = Ptr.toRValue(Ctx)) { EvalResult.setValue(*V); } else { @@ -235,6 +243,28 @@ bool EvalEmitter::emitDestroy(uint32_t I, const SourceInfo &Info) { return true; } +/// Global temporaries (LifetimeExtendedTemporary) carry their value +/// around as an APValue, which codegen accesses. +/// We set their value once when creating them, but we don't update it +/// afterwards when code changes it later. +/// This is what we do here. +void EvalEmitter::updateGlobalTemporaries() { + for (const auto &[E, Temp] : S.SeenGlobalTemporaries) { + if (std::optional GlobalIndex = P.getGlobal(E)) { + const Pointer &Ptr = P.getPtrGlobal(*GlobalIndex); + APValue *Cached = Temp->getOrCreateValue(true); + + if (std::optional T = Ctx.classify(E->getType())) { + TYPE_SWITCH(*T, { *Cached = Ptr.deref().toAPValue(); }); + } else { + if (std::optional APV = Ptr.toRValue(Ctx)) + *Cached = *APV; + } + } + } + S.SeenGlobalTemporaries.clear(); +} + //===----------------------------------------------------------------------===// // Opcode evaluators //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/Interp/EvalEmitter.h b/clang/lib/AST/Interp/EvalEmitter.h index 68accbc5214c27..d1e125cae95944 100644 --- a/clang/lib/AST/Interp/EvalEmitter.h +++ b/clang/lib/AST/Interp/EvalEmitter.h @@ -56,6 +56,7 @@ class EvalEmitter : public SourceMapper { /// Methods implemented by the compiler. virtual bool visitExpr(const Expr *E) = 0; virtual bool visitDecl(const VarDecl *VD, bool ConstantContext) = 0; + virtual bool visitFunc(const FunctionDecl *F) = 0; /// Emits jumps. bool jumpTrue(const LabelTy &Label); @@ -63,6 +64,10 @@ class EvalEmitter : public SourceMapper { bool jump(const LabelTy &Label); bool fallthrough(const LabelTy &Label); + /// Since expressions can only jump forward, predicated execution is + /// used to deal with if-else statements. 
+ bool isActive() const { return CurrentLabel == ActiveLabel; } + /// Callback for registering a local. Local createLocal(Descriptor *D); @@ -104,6 +109,8 @@ class EvalEmitter : public SourceMapper { return reinterpret_cast(It->second.get()); } + void updateGlobalTemporaries(); + // The emitter always tracks the current instruction and sets OpPC to a token // value which is mapped to the location of the opcode being evaluated. CodePtr OpPC; @@ -117,10 +124,6 @@ class EvalEmitter : public SourceMapper { /// Active block which should be executed. LabelTy ActiveLabel = 0; - /// Since expressions can only jump forward, predicated execution is - /// used to deal with if-else statements. - bool isActive() const { return CurrentLabel == ActiveLabel; } - protected: #define GET_EVAL_PROTO #include "Opcodes.inc" diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 49015b1dd63d33..0411fcad88ad0a 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -322,7 +322,7 @@ bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { }; if (const auto *D = Desc->asVarDecl(); - D && D->hasGlobalStorage() && !IsConstType(D)) { + D && D->hasGlobalStorage() && D != S.EvaluatingDecl && !IsConstType(D)) { diagnoseNonConstVariable(S, OpPC, D); return S.inConstantContext(); } @@ -341,7 +341,9 @@ bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if (!Ptr.isZero()) return true; const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_null_subobject) << CSK; + S.FFDiag(Loc, diag::note_constexpr_null_subobject) + << CSK << S.Current->getRange(OpPC); + return false; } @@ -350,7 +352,8 @@ bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if (!Ptr.isOnePastEnd()) return true; const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_access_past_end) << AK; + S.FFDiag(Loc, diag::note_constexpr_access_past_end) + << AK << S.Current->getRange(OpPC); return false; } @@ -359,7 +362,8 @@ bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if (!Ptr.isElementPastEnd()) return true; const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_past_end_subobject) << CSK; + S.FFDiag(Loc, diag::note_constexpr_past_end_subobject) + << CSK << S.Current->getRange(OpPC); return false; } @@ -369,7 +373,8 @@ bool CheckSubobject(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return true; const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_past_end_subobject) << CSK; + S.FFDiag(Loc, diag::note_constexpr_past_end_subobject) + << CSK << S.Current->getRange(OpPC); return false; } @@ -395,7 +400,7 @@ bool CheckDowncast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { assert(Ptr.isLive() && "Pointer is not live"); - if (!Ptr.isConst()) + if (!Ptr.isConst() || Ptr.isMutable()) return true; // The This pointer is writable in constructors and destructors, @@ -417,9 +422,14 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { bool CheckMutable(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { assert(Ptr.isLive() && "Pointer is not live"); - if (!Ptr.isMutable()) { + if (!Ptr.isMutable()) + return true; + + // In C++14 onwards, it is permitted to read a mutable member whose + // lifetime began within the evaluation. 
+ if (S.getLangOpts().CPlusPlus14 && + Ptr.block()->getEvalID() == S.Ctx.getEvalID()) return true; - } const SourceInfo &Loc = S.Current->getSource(OpPC); const FieldDecl *Field = Ptr.getField(); @@ -471,23 +481,24 @@ bool CheckGlobalInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return false; } -bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { - if (!CheckLive(S, OpPC, Ptr, AK_Read)) +bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK) { + if (!CheckLive(S, OpPC, Ptr, AK)) return false; if (!CheckConstant(S, OpPC, Ptr)) return false; - if (!CheckDummy(S, OpPC, Ptr, AK_Read)) + if (!CheckDummy(S, OpPC, Ptr, AK)) return false; if (!CheckExtern(S, OpPC, Ptr)) return false; - if (!CheckRange(S, OpPC, Ptr, AK_Read)) + if (!CheckRange(S, OpPC, Ptr, AK)) return false; - if (!CheckActive(S, OpPC, Ptr, AK_Read)) + if (!CheckActive(S, OpPC, Ptr, AK)) return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Read)) + if (!CheckInitialized(S, OpPC, Ptr, AK)) return false; - if (!CheckTemporary(S, OpPC, Ptr, AK_Read)) + if (!CheckTemporary(S, OpPC, Ptr, AK)) return false; if (!CheckMutable(S, OpPC, Ptr)) return false; diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 916d268aa4f095..866593b9af094a 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -92,7 +92,8 @@ bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc); bool CheckMutable(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks if a value can be loaded from a block. -bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr); +bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK = AK_Read); bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK); @@ -724,9 +725,7 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { template ::T> bool Inc(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckDummy(S, OpPC, Ptr, AK_Increment)) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) return false; return IncDecHelper(S, OpPC, Ptr); @@ -738,9 +737,7 @@ bool Inc(InterpState &S, CodePtr OpPC) { template ::T> bool IncPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckDummy(S, OpPC, Ptr, AK_Increment)) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) return false; return IncDecHelper(S, OpPC, Ptr); @@ -753,9 +750,7 @@ bool IncPop(InterpState &S, CodePtr OpPC) { template ::T> bool Dec(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckDummy(S, OpPC, Ptr, AK_Decrement)) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) return false; return IncDecHelper(S, OpPC, Ptr); @@ -767,9 +762,7 @@ bool Dec(InterpState &S, CodePtr OpPC) { template ::T> bool DecPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckDummy(S, OpPC, Ptr, AK_Decrement)) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) return false; return IncDecHelper(S, OpPC, Ptr); @@ -797,9 +790,7 @@ bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, inline bool Incf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); - if (Ptr.isDummy()) - return 
false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) return false; return IncDecFloatHelper(S, OpPC, Ptr, RM); @@ -807,9 +798,7 @@ inline bool Incf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { inline bool IncfPop(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); - if (Ptr.isDummy()) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) return false; return IncDecFloatHelper(S, OpPC, Ptr, RM); @@ -817,11 +806,7 @@ inline bool IncfPop(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { inline bool Decf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); - - if (Ptr.isDummy()) - return false; - - if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) return false; return IncDecFloatHelper(S, OpPC, Ptr, RM); @@ -829,10 +814,7 @@ inline bool Decf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { inline bool DecfPop(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); - - if (Ptr.isDummy()) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) return false; return IncDecFloatHelper(S, OpPC, Ptr, RM); @@ -1255,7 +1237,10 @@ bool GetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) { /// Same as GetGlobal, but without the checks. template ::T> bool GetGlobalUnchecked(InterpState &S, CodePtr OpPC, uint32_t I) { - auto *B = S.P.getGlobal(I); + const Pointer &Ptr = S.P.getPtrGlobal(I); + if (!Ptr.isInitialized()) + return false; + const Block *B = S.P.getGlobal(I); S.Stk.push(B->deref()); return true; } @@ -1280,16 +1265,20 @@ bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) { template ::T> bool InitGlobalTemp(InterpState &S, CodePtr OpPC, uint32_t I, const LifetimeExtendedTemporaryDecl *Temp) { - assert(Temp); + const Pointer &Ptr = S.P.getGlobal(I); + const T Value = S.Stk.peek(); APValue APV = Value.toAPValue(); APValue *Cached = Temp->getOrCreateValue(true); *Cached = APV; - const Pointer &P = S.P.getGlobal(I); - P.deref() = S.Stk.pop(); - P.initialize(); + assert(Ptr.getDeclDesc()->asExpr()); + + S.SeenGlobalTemporaries.push_back( + std::make_pair(Ptr.getDeclDesc()->asExpr(), Temp)); + Ptr.deref() = S.Stk.pop(); + Ptr.initialize(); return true; } @@ -1302,6 +1291,9 @@ inline bool InitGlobalTempComp(InterpState &S, CodePtr OpPC, const Pointer &P = S.Stk.peek(); APValue *Cached = Temp->getOrCreateValue(true); + S.SeenGlobalTemporaries.push_back( + std::make_pair(P.getDeclDesc()->asExpr(), Temp)); + if (std::optional APV = P.toRValue(S.getCtx())) { *Cached = *APV; return true; diff --git a/clang/lib/AST/Interp/InterpBlock.cpp b/clang/lib/AST/Interp/InterpBlock.cpp index 9b33d1b778fb2c..c34ea7634b4a90 100644 --- a/clang/lib/AST/Interp/InterpBlock.cpp +++ b/clang/lib/AST/Interp/InterpBlock.cpp @@ -92,7 +92,8 @@ bool Block::hasPointer(const Pointer *P) const { #endif DeadBlock::DeadBlock(DeadBlock *&Root, Block *Blk) - : Root(Root), B(Blk->Desc, Blk->IsStatic, Blk->IsExtern, /*isDead=*/true) { + : Root(Root), + B(~0u, Blk->Desc, Blk->IsStatic, Blk->IsExtern, /*isDead=*/true) { // Add the block to the chain of dead blocks. 
if (Root) Root->Prev = this; diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 2bb195648a9a91..1f25de35896303 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -49,17 +49,19 @@ enum PrimType : unsigned; class Block final { public: /// Creates a new block. - Block(const std::optional &DeclID, const Descriptor *Desc, - bool IsStatic = false, bool IsExtern = false) - : DeclID(DeclID), IsStatic(IsStatic), IsExtern(IsExtern), Desc(Desc) { - assert(Desc); - } - - Block(const Descriptor *Desc, bool IsStatic = false, bool IsExtern = false) - : DeclID((unsigned)-1), IsStatic(IsStatic), IsExtern(IsExtern), + Block(unsigned EvalID, const std::optional &DeclID, + const Descriptor *Desc, bool IsStatic = false, bool IsExtern = false) + : EvalID(EvalID), DeclID(DeclID), IsStatic(IsStatic), IsExtern(IsExtern), Desc(Desc) { - assert(Desc); - } + assert(Desc); + } + + Block(unsigned EvalID, const Descriptor *Desc, bool IsStatic = false, + bool IsExtern = false) + : EvalID(EvalID), DeclID((unsigned)-1), IsStatic(IsStatic), + IsExtern(IsExtern), Desc(Desc) { + assert(Desc); + } /// Returns the block's descriptor. const Descriptor *getDescriptor() const { return Desc; } @@ -75,7 +77,11 @@ class Block final { unsigned getSize() const { return Desc->getAllocSize(); } /// Returns the declaration ID. std::optional getDeclID() const { return DeclID; } + /// Returns whether the data of this block has been initialized via + /// invoking the Ctor func. bool isInitialized() const { return IsInitialized; } + /// The Evaluation ID this block was created in. + unsigned getEvalID() const { return EvalID; } /// Returns a pointer to the stored data. /// You are allowed to read Desc->getSize() bytes from this address. @@ -130,8 +136,10 @@ class Block final { friend class DeadBlock; friend class InterpState; - Block(const Descriptor *Desc, bool IsExtern, bool IsStatic, bool IsDead) - : IsStatic(IsStatic), IsExtern(IsExtern), IsDead(true), Desc(Desc) { + Block(unsigned EvalID, const Descriptor *Desc, bool IsExtern, bool IsStatic, + bool IsDead) + : EvalID(EvalID), IsStatic(IsStatic), IsExtern(IsExtern), IsDead(true), + Desc(Desc) { assert(Desc); } @@ -146,6 +154,7 @@ class Block final { bool hasPointer(const Pointer *P) const; #endif + const unsigned EvalID = ~0u; /// Start of the chain of pointers. Pointer *Pointers = nullptr; /// Unique identifier of the declaration. 
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index af8841c8b4ef8a..98928b3c22d7c6 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -13,6 +13,7 @@ #include "clang/AST/RecordLayout.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/TargetInfo.h" +#include "llvm/Support/SipHash.h" namespace clang { namespace interp { @@ -608,8 +609,8 @@ static bool interp__builtin_addressof(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { - PrimType PtrT = - S.getContext().classify(Call->getArg(0)->getType()).value_or(PT_Ptr); + assert(Call->getArg(0)->isLValue()); + PrimType PtrT = S.getContext().classify(Call->getArg(0)).value_or(PT_Ptr); if (PtrT == PT_FnPtr) { const FunctionPointer &Arg = S.Stk.peek(); @@ -1100,6 +1101,18 @@ static bool interp__builtin_os_log_format_buffer_size(InterpState &S, return true; } +static bool interp__builtin_ptrauth_string_discriminator( + InterpState &S, CodePtr OpPC, const InterpFrame *Frame, + const Function *Func, const CallExpr *Call) { + const auto &Ptr = S.Stk.peek(); + assert(Ptr.getFieldDesc()->isPrimitiveArray()); + + StringRef R(&Ptr.deref(), Ptr.getFieldDesc()->getNumElems() - 1); + uint64_t Result = getPointerAuthStableSipHash(R); + pushInteger(S, Result, Call->getType()); + return true; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *Call) { const InterpFrame *Frame = S.Current; @@ -1424,6 +1437,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__builtin_ptrauth_string_discriminator: + if (!interp__builtin_ptrauth_string_discriminator(S, OpPC, Frame, F, Call)) + return false; + break; + default: S.FFDiag(S.Current->getLocation(OpPC), diag::note_invalid_subexpr_in_const_expr) diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp index 54ccf9034c7a7d..b33f74dfe99f1c 100644 --- a/clang/lib/AST/Interp/InterpFrame.cpp +++ b/clang/lib/AST/Interp/InterpFrame.cpp @@ -37,7 +37,8 @@ InterpFrame::InterpFrame(InterpState &S, const Function *Func, Locals = std::make_unique(FrameSize); for (auto &Scope : Func->scopes()) { for (auto &Local : Scope.locals()) { - Block *B = new (localBlock(Local.Offset)) Block(Local.Desc); + Block *B = + new (localBlock(Local.Offset)) Block(S.Ctx.getEvalID(), Local.Desc); B->invokeCtor(); new (localInlineDesc(Local.Offset)) InlineDescriptor(Local.Desc); } @@ -220,7 +221,7 @@ Pointer InterpFrame::getParamPointer(unsigned Off) { const auto &Desc = Func->getParamDescriptor(Off); size_t BlockSize = sizeof(Block) + Desc.second->getAllocSize(); auto Memory = std::make_unique(BlockSize); - auto *B = new (Memory.get()) Block(Desc.second); + auto *B = new (Memory.get()) Block(S.Ctx.getEvalID(), Desc.second); // Copy the initial value. TYPE_SWITCH(Desc.first, new (B->data()) T(stackRef(Off))); diff --git a/clang/lib/AST/Interp/InterpState.h b/clang/lib/AST/Interp/InterpState.h index 0938a723a76d09..138e1d7ac95d5d 100644 --- a/clang/lib/AST/Interp/InterpState.h +++ b/clang/lib/AST/Interp/InterpState.h @@ -121,6 +121,12 @@ class InterpState final : public State, public SourceMapper { InterpFrame *Current = nullptr; /// Source location of the evaluating expression SourceLocation EvalLocation; + /// Declaration we're initializing/evaluating, if any. 
+ const VarDecl *EvaluatingDecl = nullptr; + + llvm::SmallVector< + std::pair> + SeenGlobalTemporaries; }; } // namespace interp diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp index d77cd8943c4966..4070d0c54225da 100644 --- a/clang/lib/AST/Interp/Pointer.cpp +++ b/clang/lib/AST/Interp/Pointer.cpp @@ -347,7 +347,7 @@ std::optional Pointer::toRValue(const Context &Ctx) const { Ty = AT->getValueType(); // Invalid pointers. - if (Ptr.isDummy() || !Ptr.isLive() || + if (Ptr.isDummy() || !Ptr.isLive() || !Ptr.isBlockPointer() || (!Ptr.isUnknownSizeArray() && Ptr.isOnePastEnd())) return false; diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index 609367a462b006..5faec75cc3ec51 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -513,6 +513,8 @@ class Pointer { unsigned getByteOffset() const { if (isIntegralPointer()) return asIntPointer().Value + Offset; + if (isOnePastEnd()) + return PastEndMark; return Offset; } @@ -551,6 +553,9 @@ class Pointer { if (!asBlockPointer().Pointee) return false; + if (isUnknownSizeArray()) + return false; + return isElementPastEnd() || (getSize() == getOffset() && !isZeroSizeArray()); } diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp index 7b1f719779e904..d3864d23925c0e 100644 --- a/clang/lib/AST/Interp/Program.cpp +++ b/clang/lib/AST/Interp/Program.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "Program.h" -#include "ByteCodeStmtGen.h" #include "Context.h" #include "Function.h" #include "Integral.h" @@ -64,7 +63,7 @@ unsigned Program::createGlobalString(const StringLiteral *S) { // The byte length does not include the null terminator. unsigned I = Globals.size(); unsigned Sz = Desc->getAllocSize(); - auto *G = new (Allocator, Sz) Global(Desc, /*isStatic=*/true, + auto *G = new (Allocator, Sz) Global(Ctx.getEvalID(), Desc, /*isStatic=*/true, /*isExtern=*/false); G->block()->invokeCtor(); @@ -127,6 +126,12 @@ std::optional Program::getGlobal(const ValueDecl *VD) { return std::nullopt; } +std::optional Program::getGlobal(const Expr *E) { + if (auto It = GlobalIndices.find(E); It != GlobalIndices.end()) + return It->second; + return std::nullopt; +} + std::optional Program::getOrCreateGlobal(const ValueDecl *VD, const Expr *Init) { if (auto Idx = getGlobal(VD)) @@ -165,7 +170,8 @@ std::optional Program::getOrCreateDummy(const ValueDecl *VD) { unsigned I = Globals.size(); auto *G = new (Allocator, Desc->getAllocSize()) - Global(getCurrentDecl(), Desc, /*IsStatic=*/true, /*IsExtern=*/false); + Global(Ctx.getEvalID(), getCurrentDecl(), Desc, /*IsStatic=*/true, + /*IsExtern=*/false); G->block()->invokeCtor(); Globals.push_back(G); @@ -196,7 +202,14 @@ std::optional Program::createGlobal(const ValueDecl *VD, } std::optional Program::createGlobal(const Expr *E) { - return createGlobal(E, E->getType(), /*isStatic=*/true, /*isExtern=*/false); + if (auto Idx = getGlobal(E)) + return Idx; + if (auto Idx = createGlobal(E, E->getType(), /*isStatic=*/true, + /*isExtern=*/false)) { + GlobalIndices[E] = *Idx; + return *Idx; + } + return std::nullopt; } std::optional Program::createGlobal(const DeclTy &D, QualType Ty, @@ -219,7 +232,7 @@ std::optional Program::createGlobal(const DeclTy &D, QualType Ty, unsigned I = Globals.size(); auto *G = new (Allocator, Desc->getAllocSize()) - Global(getCurrentDecl(), Desc, IsStatic, IsExtern); + Global(Ctx.getEvalID(), getCurrentDecl(), Desc, IsStatic, 
IsExtern); G->block()->invokeCtor(); // Initialize InlineDescriptor fields. diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h index ec7c0744b8856a..1cabc5212180f2 100644 --- a/clang/lib/AST/Interp/Program.h +++ b/clang/lib/AST/Interp/Program.h @@ -77,6 +77,7 @@ class Program final { /// Finds a global's index. std::optional getGlobal(const ValueDecl *VD); + std::optional getGlobal(const Expr *E); /// Returns or creates a global an creates an index to it. std::optional getOrCreateGlobal(const ValueDecl *VD, diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 2cac03ba163f92..5444dcf027fe2f 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -99,11 +99,10 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext { } void mangleCXXName(GlobalDecl GD, raw_ostream &) override; - void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk, + void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk, bool, raw_ostream &) override; void mangleCXXDtorThunk(const CXXDestructorDecl *DD, CXXDtorType Type, - const ThisAdjustment &ThisAdjustment, - raw_ostream &) override; + const ThunkInfo &Thunk, bool, raw_ostream &) override; void mangleReferenceTemporary(const VarDecl *D, unsigned ManglingNumber, raw_ostream &) override; void mangleCXXVTable(const CXXRecordDecl *RD, raw_ostream &) override; @@ -468,6 +467,7 @@ class CXXNameMangler { void mangleNameOrStandardSubstitution(const NamedDecl *ND); void mangleLambdaSig(const CXXRecordDecl *Lambda); void mangleModuleNamePrefix(StringRef Name, bool IsPartition = false); + void mangleVendorQualifier(StringRef Name); private: @@ -559,7 +559,6 @@ class CXXNameMangler { StringRef Prefix = ""); void mangleOperatorName(DeclarationName Name, unsigned Arity); void mangleOperatorName(OverloadedOperatorKind OO, unsigned Arity); - void mangleVendorQualifier(StringRef qualifier); void mangleQualifiers(Qualifiers Quals, const DependentAddressSpaceType *DAST = nullptr); void mangleRefQualifier(RefQualifierKind RefQualifier); @@ -7044,8 +7043,78 @@ void ItaniumMangleContextImpl::mangleCXXDtorComdat(const CXXDestructorDecl *D, Mangler.mangle(GlobalDecl(D, Dtor_Comdat)); } +/// Mangles the pointer authentication override attribute for classes +/// that have explicit overrides for the vtable authentication schema. +/// +/// The override is mangled as a parameterized vendor extension as follows +/// +/// ::= U "__vtptrauth" I +/// +/// +/// +/// E +/// +/// The extra discriminator encodes the explicit value derived from the +/// override schema, e.g. if the override has specified type based +/// discrimination the encoded value will be the discriminator derived from the +/// type name. 
+static void mangleOverrideDiscrimination(CXXNameMangler &Mangler, + ASTContext &Context, + const ThunkInfo &Thunk) { + auto &LangOpts = Context.getLangOpts(); + const CXXRecordDecl *ThisRD = Thunk.ThisType->getPointeeCXXRecordDecl(); + const CXXRecordDecl *PtrauthClassRD = + Context.baseForVTableAuthentication(ThisRD); + unsigned TypedDiscriminator = + Context.getPointerAuthVTablePointerDiscriminator(ThisRD); + Mangler.mangleVendorQualifier("__vtptrauth"); + auto &ManglerStream = Mangler.getStream(); + ManglerStream << "I"; + if (const auto *ExplicitAuth = + PtrauthClassRD->getAttr()) { + ManglerStream << "Lj" << ExplicitAuth->getKey(); + + if (ExplicitAuth->getAddressDiscrimination() == + VTablePointerAuthenticationAttr::DefaultAddressDiscrimination) + ManglerStream << "Lb" << LangOpts.PointerAuthVTPtrAddressDiscrimination; + else + ManglerStream << "Lb" + << (ExplicitAuth->getAddressDiscrimination() == + VTablePointerAuthenticationAttr::AddressDiscrimination); + + switch (ExplicitAuth->getExtraDiscrimination()) { + case VTablePointerAuthenticationAttr::DefaultExtraDiscrimination: { + if (LangOpts.PointerAuthVTPtrTypeDiscrimination) + ManglerStream << "Lj" << TypedDiscriminator; + else + ManglerStream << "Lj" << 0; + break; + } + case VTablePointerAuthenticationAttr::TypeDiscrimination: + ManglerStream << "Lj" << TypedDiscriminator; + break; + case VTablePointerAuthenticationAttr::CustomDiscrimination: + ManglerStream << "Lj" << ExplicitAuth->getCustomDiscriminationValue(); + break; + case VTablePointerAuthenticationAttr::NoExtraDiscrimination: + ManglerStream << "Lj" << 0; + break; + } + } else { + ManglerStream << "Lj" + << (unsigned)VTablePointerAuthenticationAttr::DefaultKey; + ManglerStream << "Lb" << LangOpts.PointerAuthVTPtrAddressDiscrimination; + if (LangOpts.PointerAuthVTPtrTypeDiscrimination) + ManglerStream << "Lj" << TypedDiscriminator; + else + ManglerStream << "Lj" << 0; + } + ManglerStream << "E"; +} + void ItaniumMangleContextImpl::mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk, + bool ElideOverrideInfo, raw_ostream &Out) { // ::= T // # base is the nominal target function of thunk @@ -7071,21 +7140,28 @@ void ItaniumMangleContextImpl::mangleThunk(const CXXMethodDecl *MD, Thunk.Return.Virtual.Itanium.VBaseOffsetOffset); Mangler.mangleFunctionEncoding(MD); + if (!ElideOverrideInfo) + mangleOverrideDiscrimination(Mangler, getASTContext(), Thunk); } -void ItaniumMangleContextImpl::mangleCXXDtorThunk( - const CXXDestructorDecl *DD, CXXDtorType Type, - const ThisAdjustment &ThisAdjustment, raw_ostream &Out) { +void ItaniumMangleContextImpl::mangleCXXDtorThunk(const CXXDestructorDecl *DD, + CXXDtorType Type, + const ThunkInfo &Thunk, + bool ElideOverrideInfo, + raw_ostream &Out) { // ::= T // # base is the nominal target function of thunk CXXNameMangler Mangler(*this, Out, DD, Type); Mangler.getStream() << "_ZT"; + auto &ThisAdjustment = Thunk.This; // Mangle the 'this' pointer adjustment. Mangler.mangleCallOffset(ThisAdjustment.NonVirtual, ThisAdjustment.Virtual.Itanium.VCallOffsetOffset); Mangler.mangleFunctionEncoding(GlobalDecl(DD, Type)); + if (!ElideOverrideInfo) + mangleOverrideDiscrimination(Mangler, getASTContext(), Thunk); } /// Returns the mangled name for a guard variable for the passed in VarDecl. 
diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index 4fbf0e3b42dbc8..75f6e2161a6338 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -513,10 +513,20 @@ class ASTNameGenerator::Implementation { } } else if (const auto *MD = dyn_cast_or_null(ND)) { Manglings.emplace_back(getName(ND)); - if (MD->isVirtual()) - if (const auto *TIV = Ctx.getVTableContext()->getThunkInfo(MD)) - for (const auto &T : *TIV) - Manglings.emplace_back(getMangledThunk(MD, T)); + if (MD->isVirtual()) { + if (const auto *TIV = Ctx.getVTableContext()->getThunkInfo(MD)) { + for (const auto &T : *TIV) { + std::string ThunkName; + std::string ContextualizedName = + getMangledThunk(MD, T, /* ElideOverrideInfo */ false); + if (Ctx.useAbbreviatedThunkName(MD, ContextualizedName)) + ThunkName = getMangledThunk(MD, T, /* ElideOverrideInfo */ true); + else + ThunkName = ContextualizedName; + Manglings.emplace_back(ThunkName); + } + } + } } return Manglings; @@ -569,11 +579,12 @@ class ASTNameGenerator::Implementation { return BOS.str(); } - std::string getMangledThunk(const CXXMethodDecl *MD, const ThunkInfo &T) { + std::string getMangledThunk(const CXXMethodDecl *MD, const ThunkInfo &T, + bool ElideOverrideInfo) { std::string FrontendBuf; llvm::raw_string_ostream FOS(FrontendBuf); - MC->mangleThunk(MD, T, FOS); + MC->mangleThunk(MD, T, ElideOverrideInfo, FOS); std::string BackendBuf; llvm::raw_string_ostream BOS(BackendBuf); diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 3923d34274703d..7f1e9ab02ec261 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -159,9 +159,9 @@ class MicrosoftMangleContextImpl : public MicrosoftMangleContext { const MethodVFTableLocation &ML, raw_ostream &Out) override; void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk, - raw_ostream &) override; + bool ElideOverrideInfo, raw_ostream &) override; void mangleCXXDtorThunk(const CXXDestructorDecl *DD, CXXDtorType Type, - const ThisAdjustment &ThisAdjustment, + const ThunkInfo &Thunk, bool ElideOverrideInfo, raw_ostream &) override; void mangleCXXVFTable(const CXXRecordDecl *Derived, ArrayRef BasePath, @@ -169,6 +169,8 @@ class MicrosoftMangleContextImpl : public MicrosoftMangleContext { void mangleCXXVBTable(const CXXRecordDecl *Derived, ArrayRef BasePath, raw_ostream &Out) override; + + void mangleCXXVTable(const CXXRecordDecl *, raw_ostream &) override; void mangleCXXVirtualDisplacementMap(const CXXRecordDecl *SrcRD, const CXXRecordDecl *DstRD, raw_ostream &Out) override; @@ -447,8 +449,8 @@ class MicrosoftCXXNameMangler { void mangleDecayedArrayType(const ArrayType *T); void mangleArrayType(const ArrayType *T); void mangleFunctionClass(const FunctionDecl *FD); - void mangleCallingConvention(CallingConv CC); - void mangleCallingConvention(const FunctionType *T); + void mangleCallingConvention(CallingConv CC, SourceRange Range); + void mangleCallingConvention(const FunctionType *T, SourceRange Range); void mangleIntegerLiteral(const llvm::APSInt &Number, const NonTypeTemplateParmDecl *PD = nullptr, QualType TemplateArgType = QualType()); @@ -888,7 +890,8 @@ void MicrosoftCXXNameMangler::mangleVirtualMemPtrThunk( Out << "$B"; mangleNumber(OffsetInVFTable); Out << 'A'; - mangleCallingConvention(MD->getType()->castAs()); + mangleCallingConvention(MD->getType()->castAs(), + MD->getSourceRange()); } void MicrosoftCXXNameMangler::mangleName(GlobalDecl GD) { @@ -2768,7 +2771,7 @@ void 
MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T, mangleQualifiers(Quals, /*IsMember=*/false); } - mangleCallingConvention(CC); + mangleCallingConvention(CC, Range); // ::= // ::= @ # structors (they have no declared return type) @@ -2949,7 +2952,8 @@ void MicrosoftCXXNameMangler::mangleFunctionClass(const FunctionDecl *FD) { Out << 'Y'; } } -void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) { +void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC, + SourceRange Range) { // ::= A # __cdecl // ::= B # __export __cdecl // ::= C # __pascal @@ -2962,7 +2966,10 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) { // ::= J # __export __fastcall // ::= Q # __vectorcall // ::= S # __attribute__((__swiftcall__)) // Clang-only - // ::= T # __attribute__((__swiftasynccall__)) + // ::= W # __attribute__((__swiftasynccall__)) + // ::= U # __attribute__((__preserve_most__)) + // ::= V # __attribute__((__preserve_none__)) // + // Clang-only // // Clang-only // ::= w # __regcall // ::= x # __regcall4 @@ -2974,28 +2981,55 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) { switch (CC) { default: - llvm_unreachable("Unsupported CC for mangling"); + break; case CC_Win64: case CC_X86_64SysV: - case CC_C: Out << 'A'; break; - case CC_X86Pascal: Out << 'C'; break; - case CC_X86ThisCall: Out << 'E'; break; - case CC_X86StdCall: Out << 'G'; break; - case CC_X86FastCall: Out << 'I'; break; - case CC_X86VectorCall: Out << 'Q'; break; - case CC_Swift: Out << 'S'; break; - case CC_SwiftAsync: Out << 'W'; break; - case CC_PreserveMost: Out << 'U'; break; + case CC_C: + Out << 'A'; + return; + case CC_X86Pascal: + Out << 'C'; + return; + case CC_X86ThisCall: + Out << 'E'; + return; + case CC_X86StdCall: + Out << 'G'; + return; + case CC_X86FastCall: + Out << 'I'; + return; + case CC_X86VectorCall: + Out << 'Q'; + return; + case CC_Swift: + Out << 'S'; + return; + case CC_SwiftAsync: + Out << 'W'; + return; + case CC_PreserveMost: + Out << 'U'; + return; + case CC_PreserveNone: + Out << 'V'; + return; case CC_X86RegCall: if (getASTContext().getLangOpts().RegCall4) Out << "x"; else Out << "w"; - break; + return; } + + DiagnosticsEngine &Diags = Context.getDiags(); + unsigned DiagID = Diags.getCustomDiagID( + DiagnosticsEngine::Error, "cannot mangle this calling convention yet"); + Diags.Report(Range.getBegin(), DiagID) << Range; } -void MicrosoftCXXNameMangler::mangleCallingConvention(const FunctionType *T) { - mangleCallingConvention(T->getCallConv()); +void MicrosoftCXXNameMangler::mangleCallingConvention(const FunctionType *T, + SourceRange Range) { + mangleCallingConvention(T->getCallConv(), Range); } void MicrosoftCXXNameMangler::mangleThrowSpecification( @@ -3715,6 +3749,7 @@ void MicrosoftMangleContextImpl::mangleVirtualMemPtrThunk( void MicrosoftMangleContextImpl::mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk, + bool /*ElideOverrideInfo*/, raw_ostream &Out) { msvc_hashing_ostream MHO(Out); MicrosoftCXXNameMangler Mangler(*this, MHO); @@ -3736,9 +3771,11 @@ void MicrosoftMangleContextImpl::mangleThunk(const CXXMethodDecl *MD, DeclForFPT->getType()->castAs(), MD); } -void MicrosoftMangleContextImpl::mangleCXXDtorThunk( - const CXXDestructorDecl *DD, CXXDtorType Type, - const ThisAdjustment &Adjustment, raw_ostream &Out) { +void MicrosoftMangleContextImpl::mangleCXXDtorThunk(const CXXDestructorDecl *DD, + CXXDtorType Type, + const ThunkInfo &Thunk, + bool /*ElideOverrideInfo*/, + raw_ostream &Out) { // 
FIXME: Actually, the dtor thunk should be emitted for vector deleting // dtors rather than scalar deleting dtors. Just use the vector deleting dtor // mangling manually until we support both deleting dtor types. @@ -3747,6 +3784,7 @@ void MicrosoftMangleContextImpl::mangleCXXDtorThunk( MicrosoftCXXNameMangler Mangler(*this, MHO, DD, Type); Mangler.getStream() << "??_E"; Mangler.mangleName(DD->getParent()); + auto &Adjustment = Thunk.This; mangleThunkThisAdjustment(DD->getAccess(), Adjustment, Mangler, MHO); Mangler.mangleFunctionType(DD->getType()->castAs(), DD); } @@ -3771,6 +3809,12 @@ void MicrosoftMangleContextImpl::mangleCXXVFTable( Mangler.getStream() << '@'; } +void MicrosoftMangleContextImpl::mangleCXXVTable(const CXXRecordDecl *Derived, + raw_ostream &Out) { + // TODO: Determine appropriate mangling for MSABI + mangleCXXVFTable(Derived, {}, Out); +} + void MicrosoftMangleContextImpl::mangleCXXVBTable( const CXXRecordDecl *Derived, ArrayRef BasePath, raw_ostream &Out) { diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index d8519b2071e6da..c8792941a6bb64 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements the subclesses of Stmt class declared in StmtOpenMP.h +// This file implements the subclasses of Stmt class declared in StmtOpenMP.h // //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 656b733a13b0e9..d8b885870de3ac 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3711,6 +3711,34 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef params, auto &EllipsisLoc = *getTrailingObjects(); EllipsisLoc = epi.EllipsisLoc; } + + if (!epi.FunctionEffects.empty()) { + auto &ExtraBits = *getTrailingObjects(); + size_t EffectsCount = epi.FunctionEffects.size(); + ExtraBits.NumFunctionEffects = EffectsCount; + assert(ExtraBits.NumFunctionEffects == EffectsCount && + "effect bitfield overflow"); + + ArrayRef SrcFX = epi.FunctionEffects.effects(); + auto *DestFX = getTrailingObjects(); + std::uninitialized_copy(SrcFX.begin(), SrcFX.end(), DestFX); + + ArrayRef SrcConds = epi.FunctionEffects.conditions(); + if (!SrcConds.empty()) { + ExtraBits.EffectsHaveConditions = true; + auto *DestConds = getTrailingObjects(); + std::uninitialized_copy(SrcConds.begin(), SrcConds.end(), DestConds); + assert(std::any_of(SrcConds.begin(), SrcConds.end(), + [](const EffectConditionExpr &EC) { + if (const Expr *E = EC.getCondition()) + return E->isTypeDependent() || + E->isValueDependent(); + return false; + }) && + "expected a dependent expression among the conditions"); + addDependence(TypeDependence::DependentInstantiation); + } + } } bool FunctionProtoType::hasDependentExceptionSpec() const { @@ -3794,6 +3822,7 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result, // Finally we have a trailing return type flag (bool) // combined with AArch64 SME Attributes, to save space: // int + // combined with any FunctionEffects // // There is no ambiguity between the consumed arguments and an empty EH // spec because of the leading 'bool' which unambiguously indicates @@ -3829,6 +3858,8 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result, epi.ExtInfo.Profile(ID); ID.AddInteger((epi.AArch64SMEAttributes << 1) | epi.HasTrailingReturn); + + 
epi.FunctionEffects.Profile(ID); } void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, @@ -5093,3 +5124,257 @@ void AutoType::Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) { Profile(ID, Context, getDeducedType(), getKeyword(), isDependentType(), getTypeConstraintConcept(), getTypeConstraintArguments()); } + +FunctionEffect::Kind FunctionEffect::oppositeKind() const { + switch (kind()) { + case Kind::NonBlocking: + return Kind::Blocking; + case Kind::Blocking: + return Kind::NonBlocking; + case Kind::NonAllocating: + return Kind::Allocating; + case Kind::Allocating: + return Kind::NonAllocating; + case Kind::None: + return Kind::None; + } + llvm_unreachable("unknown effect kind"); +} + +StringRef FunctionEffect::name() const { + switch (kind()) { + case Kind::NonBlocking: + return "nonblocking"; + case Kind::NonAllocating: + return "nonallocating"; + case Kind::Blocking: + return "blocking"; + case Kind::Allocating: + return "allocating"; + case Kind::None: + return "(none)"; + } + llvm_unreachable("unknown effect kind"); +} + +bool FunctionEffect::canInferOnFunction(const Decl &Callee) const { + switch (kind()) { + case Kind::NonAllocating: + case Kind::NonBlocking: { + FunctionEffectsRef CalleeFX; + if (auto *FD = Callee.getAsFunction()) + CalleeFX = FD->getFunctionEffects(); + else if (auto *BD = dyn_cast(&Callee)) + CalleeFX = BD->getFunctionEffects(); + else + return false; + for (const FunctionEffectWithCondition &CalleeEC : CalleeFX) { + // nonblocking/nonallocating cannot call allocating. + if (CalleeEC.Effect.kind() == Kind::Allocating) + return false; + // nonblocking cannot call blocking. + if (kind() == Kind::NonBlocking && + CalleeEC.Effect.kind() == Kind::Blocking) + return false; + } + return true; + } + + case Kind::Allocating: + case Kind::Blocking: + return false; + + case Kind::None: + assert(0 && "canInferOnFunction with None"); + break; + } + llvm_unreachable("unknown effect kind"); +} + +bool FunctionEffect::shouldDiagnoseFunctionCall( + bool Direct, ArrayRef CalleeFX) const { + switch (kind()) { + case Kind::NonAllocating: + case Kind::NonBlocking: { + const Kind CallerKind = kind(); + for (const auto &Effect : CalleeFX) { + const Kind EK = Effect.kind(); + // Does callee have same or stronger constraint? + if (EK == CallerKind || + (CallerKind == Kind::NonAllocating && EK == Kind::NonBlocking)) { + return false; // no diagnostic + } + } + return true; // warning + } + case Kind::Allocating: + case Kind::Blocking: + return false; + case Kind::None: + assert(0 && "shouldDiagnoseFunctionCall with None"); + break; + } + llvm_unreachable("unknown effect kind"); +} + +// ===== + +void FunctionEffectsRef::Profile(llvm::FoldingSetNodeID &ID) const { + bool HasConds = !Conditions.empty(); + + ID.AddInteger(size() | (HasConds << 31u)); + for (unsigned Idx = 0, Count = Effects.size(); Idx != Count; ++Idx) { + ID.AddInteger(Effects[Idx].toOpaqueInt32()); + if (HasConds) + ID.AddPointer(Conditions[Idx].getCondition()); + } +} + +bool FunctionEffectSet::insert(const FunctionEffectWithCondition &NewEC, + Conflicts &Errs) { + FunctionEffect::Kind NewOppositeKind = NewEC.Effect.oppositeKind(); + Expr *NewCondition = NewEC.Cond.getCondition(); + + // The index at which insertion will take place; default is at end + // but we might find an earlier insertion point. 
+ unsigned InsertIdx = Effects.size(); + unsigned Idx = 0; + for (const FunctionEffectWithCondition &EC : *this) { + // Note about effects with conditions: They are considered distinct from + // those without conditions; they are potentially unique, redundant, or + // in conflict, but we can't tell which until the condition is evaluated. + if (EC.Cond.getCondition() == nullptr && NewCondition == nullptr) { + if (EC.Effect.kind() == NewEC.Effect.kind()) { + // There is no condition, and the effect kind is already present, + // so just fail to insert the new one (creating a duplicate), + // and return success. + return true; + } + + if (EC.Effect.kind() == NewOppositeKind) { + Errs.push_back({EC, NewEC}); + return false; + } + } + + if (NewEC.Effect.kind() < EC.Effect.kind() && InsertIdx > Idx) + InsertIdx = Idx; + + ++Idx; + } + + if (NewCondition || !Conditions.empty()) { + if (Conditions.empty() && !Effects.empty()) + Conditions.resize(Effects.size()); + Conditions.insert(Conditions.begin() + InsertIdx, + NewEC.Cond.getCondition()); + } + Effects.insert(Effects.begin() + InsertIdx, NewEC.Effect); + return true; +} + +bool FunctionEffectSet::insert(const FunctionEffectsRef &Set, Conflicts &Errs) { + for (const auto &Item : Set) + insert(Item, Errs); + return Errs.empty(); +} + +FunctionEffectSet FunctionEffectSet::getIntersection(FunctionEffectsRef LHS, + FunctionEffectsRef RHS) { + FunctionEffectSet Result; + FunctionEffectSet::Conflicts Errs; + + // We could use std::set_intersection but that would require expanding the + // container interface to include push_back, making it available to clients + // who might fail to maintain invariants. + auto IterA = LHS.begin(), EndA = LHS.end(); + auto IterB = RHS.begin(), EndB = RHS.end(); + + auto FEWCLess = [](const FunctionEffectWithCondition &LHS, + const FunctionEffectWithCondition &RHS) { + return std::tuple(LHS.Effect, uintptr_t(LHS.Cond.getCondition())) < + std::tuple(RHS.Effect, uintptr_t(RHS.Cond.getCondition())); + }; + + while (IterA != EndA && IterB != EndB) { + FunctionEffectWithCondition A = *IterA; + FunctionEffectWithCondition B = *IterB; + if (FEWCLess(A, B)) + ++IterA; + else if (FEWCLess(B, A)) + ++IterB; + else { + Result.insert(A, Errs); + ++IterA; + ++IterB; + } + } + + // Insertion shouldn't be able to fail; that would mean both input + // sets contained conflicts. + assert(Errs.empty() && "conflict shouldn't be possible in getIntersection"); + + return Result; +} + +FunctionEffectSet FunctionEffectSet::getUnion(FunctionEffectsRef LHS, + FunctionEffectsRef RHS, + Conflicts &Errs) { + // Optimize for either of the two sets being empty (very common). 
+ if (LHS.empty()) + return FunctionEffectSet(RHS); + + FunctionEffectSet Combined(LHS); + Combined.insert(RHS, Errs); + return Combined; +} + +LLVM_DUMP_METHOD void FunctionEffectsRef::dump(llvm::raw_ostream &OS) const { + OS << "Effects{"; + bool First = true; + for (const auto &CFE : *this) { + if (!First) + OS << ", "; + else + First = false; + OS << CFE.Effect.name(); + if (Expr *E = CFE.Cond.getCondition()) { + OS << '('; + E->dump(); + OS << ')'; + } + } + OS << "}"; +} + +LLVM_DUMP_METHOD void FunctionEffectSet::dump(llvm::raw_ostream &OS) const { + FunctionEffectsRef(*this).dump(OS); +} + +FunctionEffectsRef FunctionEffectsRef::get(QualType QT) { + while (true) { + QualType Pointee = QT->getPointeeType(); + if (Pointee.isNull()) + break; + QT = Pointee; + } + if (const auto *FPT = QT->getAs()) + return FPT->getFunctionEffects(); + return {}; +} + +FunctionEffectsRef +FunctionEffectsRef::create(ArrayRef FX, + ArrayRef Conds) { + assert(std::is_sorted(FX.begin(), FX.end()) && "effects should be sorted"); + assert((Conds.empty() || Conds.size() == FX.size()) && + "effects size should match conditions size"); + return FunctionEffectsRef(FX, Conds); +} + +std::string FunctionEffectWithCondition::description() const { + std::string Result(Effect.name().str()); + if (Cond.getCondition() != nullptr) + Result += "(expr)"; + return Result; +} diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 4add4d3af69a30..7c87fd587880ed 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1016,6 +1016,17 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T, } T->printExceptionSpecification(OS, Policy); + const FunctionEffectsRef FX = T->getFunctionEffects(); + for (const auto &CFE : FX) { + OS << " __attribute__((" << CFE.Effect.name(); + if (const Expr *E = CFE.Cond.getCondition()) { + OS << '('; + E->printPretty(OS, nullptr, Policy); + OS << ')'; + } + OS << "))"; + } + if (T->hasTrailingReturn()) { OS << " -> "; print(T->getReturnType(), OS, StringRef()); @@ -1946,6 +1957,10 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::ArmOut: case attr::ArmInOut: case attr::ArmPreserves: + case attr::NonBlocking: + case attr::NonAllocating: + case attr::Blocking: + case attr::Allocating: llvm_unreachable("This attribute should have been handled already"); case attr::NSReturnsRetained: diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp index a956ca5b37acfe..e941c3bedb0a7e 100644 --- a/clang/lib/AST/VTableBuilder.cpp +++ b/clang/lib/AST/VTableBuilder.cpp @@ -1147,11 +1147,41 @@ void ItaniumVTableBuilder::ComputeThisAdjustments() { continue; // Add it. - VTableThunks[VTableIndex].This = ThisAdjustment; + auto SetThisAdjustmentThunk = [&](uint64_t Idx) { + // If a this pointer adjustment is required, record the method that + // created the vtable entry. MD is not necessarily the method that + // created the entry since derived classes overwrite base class + // information in MethodInfoMap, hence findOriginalMethodInMap is called + // here. 
+ // + // For example, in the following class hierarchy, if MD = D1::m and + // Overrider = D2::m, the original method that created the entry is B0::m, + // which is what findOriginalMethodInMap(MD) returns: + // + // struct B0 { int a; virtual void m(); }; + // struct D0 : B0 { int a; void m() override; }; + // struct D1 : B0 { int a; void m() override; }; + // struct D2 : D0, D1 { int a; void m() override; }; + // + // We need to record the method because we cannot + // call findOriginalMethod to find the method that created the entry if + // the method in the entry requires adjustment. + // + // Do not set ThunkInfo::Method if Idx is already in VTableThunks. This + // can happen when covariant return adjustment is required too. + if (!VTableThunks.count(Idx)) { + const CXXMethodDecl *Method = VTables.findOriginalMethodInMap(MD); + VTableThunks[Idx].Method = Method; + VTableThunks[Idx].ThisType = Method->getThisType().getTypePtr(); + } + VTableThunks[Idx].This = ThisAdjustment; + }; + + SetThisAdjustmentThunk(VTableIndex); if (isa(MD)) { // Add an adjustment for the deleting destructor as well. - VTableThunks[VTableIndex + 1].This = ThisAdjustment; + SetThisAdjustmentThunk(VTableIndex + 1); } } @@ -1509,6 +1539,8 @@ void ItaniumVTableBuilder::AddMethods( FindNearestOverriddenMethod(MD, PrimaryBases)) { if (ComputeReturnAdjustmentBaseOffset(Context, MD, OverriddenMD).isEmpty()) { + VTables.setOriginalMethod(MD, OverriddenMD); + // Replace the method info of the overridden method with our own // method. assert(MethodInfoMap.count(OverriddenMD) && @@ -1547,7 +1579,8 @@ void ItaniumVTableBuilder::AddMethods( // This is a virtual thunk for the most derived class, add it. AddThunk(Overrider.Method, - ThunkInfo(ThisAdjustment, ReturnAdjustment)); + ThunkInfo(ThisAdjustment, ReturnAdjustment, + OverriddenMD->getThisType().getTypePtr())); } } @@ -1615,6 +1648,15 @@ void ItaniumVTableBuilder::AddMethods( ReturnAdjustment ReturnAdjustment = ComputeReturnAdjustment(ReturnAdjustmentOffset); + // If a return adjustment is required, record the method that created the + // vtable entry. We need to record the method because we cannot call + // findOriginalMethod to find the method that created the entry if the + // method in the entry requires adjustment. + if (!ReturnAdjustment.isEmpty()) { + VTableThunks[Components.size()].Method = MD; + VTableThunks[Components.size()].ThisType = MD->getThisType().getTypePtr(); + } + AddMethod(Overrider.Method, ReturnAdjustment); } } @@ -1890,11 +1932,31 @@ void ItaniumVTableBuilder::LayoutVTablesForVirtualBases( } } +static void printThunkMethod(const ThunkInfo &Info, raw_ostream &Out) { + if (!Info.Method) + return; + std::string Str = PredefinedExpr::ComputeName( + PredefinedIdentKind::PrettyFunctionNoVirtual, Info.Method); + Out << " method: " << Str; +} + /// dumpLayout - Dump the vtable layout. void ItaniumVTableBuilder::dumpLayout(raw_ostream &Out) { // FIXME: write more tests that actually use the dumpLayout output to prevent // ItaniumVTableBuilder regressions. 
+ Out << "Original map\n"; + + for (const auto &P : VTables.getOriginalMethodMap()) { + std::string Str0 = + PredefinedExpr::ComputeName(PredefinedIdentKind::PrettyFunctionNoVirtual, + P.first); + std::string Str1 = + PredefinedExpr::ComputeName(PredefinedIdentKind::PrettyFunctionNoVirtual, + P.second); + Out << " " << Str0 << " -> " << Str1 << "\n"; + } + if (isBuildingConstructorVTable()) { Out << "Construction vtable for ('"; MostDerivedClass->printQualifiedName(Out); @@ -1978,6 +2040,7 @@ void ItaniumVTableBuilder::dumpLayout(raw_ostream &Out) { } Out << ']'; + printThunkMethod(Thunk, Out); } // If this function pointer has a 'this' pointer adjustment, dump it. @@ -1991,6 +2054,7 @@ void ItaniumVTableBuilder::dumpLayout(raw_ostream &Out) { } Out << ']'; + printThunkMethod(Thunk, Out); } } @@ -2027,6 +2091,7 @@ void ItaniumVTableBuilder::dumpLayout(raw_ostream &Out) { Out << ']'; } + printThunkMethod(Thunk, Out); } break; @@ -2125,7 +2190,6 @@ void ItaniumVTableBuilder::dumpLayout(raw_ostream &Out) { ThunkInfoVectorTy ThunksVector = Thunks[MD]; llvm::sort(ThunksVector, [](const ThunkInfo &LHS, const ThunkInfo &RHS) { - assert(LHS.Method == nullptr && RHS.Method == nullptr); return std::tie(LHS.This, LHS.Return) < std::tie(RHS.This, RHS.Return); }); @@ -2314,6 +2378,35 @@ ItaniumVTableContext::getVirtualBaseOffsetOffset(const CXXRecordDecl *RD, return I->second; } +GlobalDecl ItaniumVTableContext::findOriginalMethod(GlobalDecl GD) { + const auto *MD = cast(GD.getDecl()); + computeVTableRelatedInformation(MD->getParent()); + const CXXMethodDecl *OriginalMD = findOriginalMethodInMap(MD); + + if (const auto *DD = dyn_cast(OriginalMD)) + return GlobalDecl(DD, GD.getDtorType()); + return OriginalMD; +} + +const CXXMethodDecl * +ItaniumVTableContext::findOriginalMethodInMap(const CXXMethodDecl *MD) const { + // Traverse the chain of virtual methods until we find the method that added + // the v-table slot. + while (true) { + auto I = OriginalMethodMap.find(MD); + + // MD doesn't exist in OriginalMethodMap, so it must be the method we are + // looking for. + if (I == OriginalMethodMap.end()) + break; + + // Set MD to the overridden method. + MD = I->second; + } + + return MD; +} + static std::unique_ptr CreateVTableLayout(const ItaniumVTableBuilder &Builder) { SmallVector @@ -3094,9 +3187,9 @@ void VFTableBuilder::AddMethods(BaseSubobject Base, unsigned BaseDepth, ReturnAdjustmentOffset.VirtualBase); } } - + auto ThisType = (OverriddenMD ? OverriddenMD : MD)->getThisType().getTypePtr(); AddMethod(FinalOverriderMD, - ThunkInfo(ThisAdjustmentOffset, ReturnAdjustment, + ThunkInfo(ThisAdjustmentOffset, ReturnAdjustment, ThisType, ForceReturnAdjustmentMangling ? MD : nullptr)); } } diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 8109ac6a781e73..3c896d373a211d 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -391,17 +391,22 @@ class TransferVisitor : public ConstStmtVisitor { } case UO_PreInc: case UO_PreDec: - // Propagate the storage location, but don't create a new value; to - // avoid generating unnecessary values, we leave it to the specific - // analysis to do this if desired. + // Propagate the storage location and clear out any value associated with + // it (to represent the fact that the value has definitely changed). + // To avoid generating unnecessary values, we leave it to the specific + // analysis to create a new value if desired. 
propagateStorageLocation(*S->getSubExpr(), *S, Env); + if (StorageLocation *Loc = Env.getStorageLocation(*S->getSubExpr())) + Env.clearValue(*Loc); break; case UO_PostInc: case UO_PostDec: - // Propagate the old value, but don't create a new value; to avoid - // generating unnecessary values, we leave it to the specific analysis - // to do this if desired. + // Propagate the old value, then clear out any value associated with the + // storage location (to represent the fact that the value has definitely + // changed). See above for rationale. propagateValue(*S->getSubExpr(), *S, Env); + if (StorageLocation *Loc = Env.getStorageLocation(*S->getSubExpr())) + Env.clearValue(*Loc); break; default: break; diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index e25b843c9bf83e..550c2a3ffe522f 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -37,6 +37,7 @@ #include "clang/Basic/OperatorKinds.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" +#include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ImmutableMap.h" @@ -1034,9 +1035,10 @@ class ThreadSafetyAnalyzer { template void getMutexIDs(CapExprSet &Mtxs, AttrType *Attr, const Expr *Exp, - const NamedDecl *D, - const CFGBlock *PredBlock, const CFGBlock *CurrBlock, - Expr *BrE, bool Neg); + const NamedDecl *D, const CFGBlock *PredBlock, + const CFGBlock *CurrBlock, + const ASTContext &TrylockAttrContext, + Expr *TrylockAttrSuccessValue, bool Neg); const CallExpr* getTrylockCallExpr(const Stmt *Cond, LocalVarContext C, bool &Negate); @@ -1359,17 +1361,18 @@ void ThreadSafetyAnalyzer::getMutexIDs(CapExprSet &Mtxs, AttrType *Attr, const Expr *Exp, const NamedDecl *D, const CFGBlock *PredBlock, const CFGBlock *CurrBlock, - Expr *BrE, bool Neg) { - // Find out which branch has the lock - bool branch = false; - if (const auto *BLE = dyn_cast_or_null(BrE)) - branch = BLE->getValue(); - else if (const auto *ILE = dyn_cast_or_null(BrE)) - branch = ILE->getValue().getBoolValue(); - - int branchnum = branch ? 0 : 1; - if (Neg) - branchnum = !branchnum; + const ASTContext &TrylockAttrContext, + Expr *TrylockAttrSuccess, + bool Neg) { + // Evaluate the trylock's success value as a boolean. + bool trylockSuccessValue = false; + if (!TrylockAttrSuccess->EvaluateAsBooleanCondition( + trylockSuccessValue, TrylockAttrContext, + /*InConstantContext=*/true)) { + llvm_unreachable("Trylock success value could not be evaluated."); + } + + const int branchnum = Neg ? !!trylockSuccessValue : !trylockSuccessValue; // If we've taken the trylock branch, then add the lock int i = 0; @@ -1390,8 +1393,15 @@ static bool getStaticBooleanValue(Expr *E, bool &TCond) { } else if (const auto *ILE = dyn_cast(E)) { TCond = ILE->getValue().getBoolValue(); return true; - } else if (auto *CE = dyn_cast(E)) + } else if (auto *CE = dyn_cast(E)) { return getStaticBooleanValue(CE->getSubExpr(), TCond); + } else if (auto *DRE = dyn_cast(E)) { + if (auto *ECD = dyn_cast(DRE->getDecl())) { + llvm::APSInt IV = ECD->getInitVal(); + TCond = IV.getBoolValue(); + return true; + } + } return false; } @@ -1497,27 +1507,27 @@ void ThreadSafetyAnalyzer::getEdgeLockset(FactSet& Result, // If the condition is a call to a Trylock function, then grab the attributes for (const auto *Attr : FunDecl->attrs()) { switch (Attr->getKind()) { - case attr::TryAcquireCapability: { - auto *A = cast(Attr); - getMutexIDs(A->isShared() ? 
SharedLocksToAdd : ExclusiveLocksToAdd, A, - Exp, FunDecl, PredBlock, CurrBlock, A->getSuccessValue(), - Negate); - break; - }; - case attr::ExclusiveTrylockFunction: { - const auto *A = cast(Attr); - getMutexIDs(ExclusiveLocksToAdd, A, Exp, FunDecl, PredBlock, CurrBlock, - A->getSuccessValue(), Negate); - break; - } - case attr::SharedTrylockFunction: { - const auto *A = cast(Attr); - getMutexIDs(SharedLocksToAdd, A, Exp, FunDecl, PredBlock, CurrBlock, - A->getSuccessValue(), Negate); - break; - } - default: - break; + case attr::TryAcquireCapability: { + auto *A = cast(Attr); + getMutexIDs(A->isShared() ? SharedLocksToAdd : ExclusiveLocksToAdd, A, + Exp, FunDecl, PredBlock, CurrBlock, FunDecl->getASTContext(), + A->getSuccessValue(), Negate); + break; + }; + case attr::ExclusiveTrylockFunction: { + const auto *A = cast(Attr); + getMutexIDs(ExclusiveLocksToAdd, A, Exp, FunDecl, PredBlock, CurrBlock, + FunDecl->getASTContext(), A->getSuccessValue(), Negate); + break; + } + case attr::SharedTrylockFunction: { + const auto *A = cast(Attr); + getMutexIDs(SharedLocksToAdd, A, Exp, FunDecl, PredBlock, CurrBlock, + FunDecl->getASTContext(), A->getSuccessValue(), Negate); + break; + } + default: + break; } } diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1d96a929f95d85..faf3878f064d20 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -72,23 +72,21 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) { } namespace { -struct CudaArchToStringMap { - CudaArch arch; +struct OffloadArchToStringMap { + OffloadArch arch; const char *arch_name; const char *virtual_arch_name; }; } // namespace -#define SM2(sm, ca) \ - { CudaArch::SM_##sm, "sm_" #sm, ca } +#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca} #define SM(sm) SM2(sm, "compute_" #sm) -#define GFX(gpu) \ - { CudaArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" } -static const CudaArchToStringMap arch_names[] = { +#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"} +static const OffloadArchToStringMap arch_names[] = { // clang-format off - {CudaArch::UNUSED, "", ""}, + {OffloadArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi - SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler + SM(30), {OffloadArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -112,7 +110,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(803), // gfx803 GFX(805), // gfx805 GFX(810), // gfx810 - {CudaArch::GFX9_GENERIC, "gfx9-generic", "compute_amdgcn"}, + {OffloadArch::GFX9_GENERIC, "gfx9-generic", "compute_amdgcn"}, GFX(900), // gfx900 GFX(902), // gfx902 GFX(904), // gfx903 @@ -124,12 +122,12 @@ static const CudaArchToStringMap arch_names[] = { GFX(940), // gfx940 GFX(941), // gfx941 GFX(942), // gfx942 - {CudaArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, + {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, GFX(1010), // gfx1010 GFX(1011), // gfx1011 GFX(1012), // gfx1012 GFX(1013), // gfx1013 - {CudaArch::GFX10_3_GENERIC, "gfx10-3-generic", "compute_amdgcn"}, + {OffloadArch::GFX10_3_GENERIC, "gfx10-3-generic", "compute_amdgcn"}, GFX(1030), // gfx1030 GFX(1031), // gfx1031 GFX(1032), // gfx1032 @@ -137,7 +135,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(1034), // gfx1034 GFX(1035), // gfx1035 GFX(1036), // gfx1036 - {CudaArch::GFX11_GENERIC, "gfx11-generic", "compute_amdgcn"}, + 
{OffloadArch::GFX11_GENERIC, "gfx11-generic", "compute_amdgcn"}, GFX(1100), // gfx1100 GFX(1101), // gfx1101 GFX(1102), // gfx1102 @@ -145,104 +143,105 @@ static const CudaArchToStringMap arch_names[] = { GFX(1150), // gfx1150 GFX(1151), // gfx1151 GFX(1152), // gfx1152 - {CudaArch::GFX12_GENERIC, "gfx12-generic", "compute_amdgcn"}, + {OffloadArch::GFX12_GENERIC, "gfx12-generic", "compute_amdgcn"}, GFX(1200), // gfx1200 GFX(1201), // gfx1201 - {CudaArch::Generic, "generic", ""}, + {OffloadArch::AMDGCNSPIRV, "amdgcnspirv", "compute_amdgcn"}, + {OffloadArch::Generic, "generic", ""}, // clang-format on }; #undef SM #undef SM2 #undef GFX -const char *CudaArchToString(CudaArch A) { +const char *OffloadArchToString(OffloadArch A) { auto result = std::find_if( std::begin(arch_names), std::end(arch_names), - [A](const CudaArchToStringMap &map) { return A == map.arch; }); + [A](const OffloadArchToStringMap &map) { return A == map.arch; }); if (result == std::end(arch_names)) return "unknown"; return result->arch_name; } -const char *CudaArchToVirtualArchString(CudaArch A) { +const char *OffloadArchToVirtualArchString(OffloadArch A) { auto result = std::find_if( std::begin(arch_names), std::end(arch_names), - [A](const CudaArchToStringMap &map) { return A == map.arch; }); + [A](const OffloadArchToStringMap &map) { return A == map.arch; }); if (result == std::end(arch_names)) return "unknown"; return result->virtual_arch_name; } -CudaArch StringToCudaArch(llvm::StringRef S) { +OffloadArch StringToOffloadArch(llvm::StringRef S) { auto result = std::find_if( std::begin(arch_names), std::end(arch_names), - [S](const CudaArchToStringMap &map) { return S == map.arch_name; }); + [S](const OffloadArchToStringMap &map) { return S == map.arch_name; }); if (result == std::end(arch_names)) - return CudaArch::UNKNOWN; + return OffloadArch::UNKNOWN; return result->arch; } -CudaVersion MinVersionForCudaArch(CudaArch A) { - if (A == CudaArch::UNKNOWN) +CudaVersion MinVersionForOffloadArch(OffloadArch A) { + if (A == OffloadArch::UNKNOWN) return CudaVersion::UNKNOWN; // AMD GPUs do not depend on CUDA versions. 
- if (IsAMDGpuArch(A)) + if (IsAMDOffloadArch(A)) return CudaVersion::CUDA_70; switch (A) { - case CudaArch::SM_20: - case CudaArch::SM_21: - case CudaArch::SM_30: - case CudaArch::SM_32_: - case CudaArch::SM_35: - case CudaArch::SM_37: - case CudaArch::SM_50: - case CudaArch::SM_52: - case CudaArch::SM_53: + case OffloadArch::SM_20: + case OffloadArch::SM_21: + case OffloadArch::SM_30: + case OffloadArch::SM_32_: + case OffloadArch::SM_35: + case OffloadArch::SM_37: + case OffloadArch::SM_50: + case OffloadArch::SM_52: + case OffloadArch::SM_53: return CudaVersion::CUDA_70; - case CudaArch::SM_60: - case CudaArch::SM_61: - case CudaArch::SM_62: + case OffloadArch::SM_60: + case OffloadArch::SM_61: + case OffloadArch::SM_62: return CudaVersion::CUDA_80; - case CudaArch::SM_70: + case OffloadArch::SM_70: return CudaVersion::CUDA_90; - case CudaArch::SM_72: + case OffloadArch::SM_72: return CudaVersion::CUDA_91; - case CudaArch::SM_75: + case OffloadArch::SM_75: return CudaVersion::CUDA_100; - case CudaArch::SM_80: + case OffloadArch::SM_80: return CudaVersion::CUDA_110; - case CudaArch::SM_86: + case OffloadArch::SM_86: return CudaVersion::CUDA_111; - case CudaArch::SM_87: + case OffloadArch::SM_87: return CudaVersion::CUDA_114; - case CudaArch::SM_89: - case CudaArch::SM_90: + case OffloadArch::SM_89: + case OffloadArch::SM_90: return CudaVersion::CUDA_118; - case CudaArch::SM_90a: + case OffloadArch::SM_90a: return CudaVersion::CUDA_120; default: llvm_unreachable("invalid enum"); } } -CudaVersion MaxVersionForCudaArch(CudaArch A) { +CudaVersion MaxVersionForOffloadArch(OffloadArch A) { // AMD GPUs do not depend on CUDA versions. - if (IsAMDGpuArch(A)) + if (IsAMDOffloadArch(A)) return CudaVersion::NEW; switch (A) { - case CudaArch::UNKNOWN: + case OffloadArch::UNKNOWN: return CudaVersion::UNKNOWN; - case CudaArch::SM_20: - case CudaArch::SM_21: + case OffloadArch::SM_20: + case OffloadArch::SM_21: return CudaVersion::CUDA_80; - case CudaArch::SM_30: - case CudaArch::SM_32_: + case OffloadArch::SM_30: + case OffloadArch::SM_32_: return CudaVersion::CUDA_102; - case CudaArch::SM_35: - case CudaArch::SM_37: + case OffloadArch::SM_35: + case OffloadArch::SM_37: return CudaVersion::CUDA_118; default: return CudaVersion::NEW; diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index b3e9affbb3e58a..7c8990880fae3b 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -702,6 +702,13 @@ bool clang::needsTaskBasedThreadLimit(OpenMPDirectiveKind DKind) { DKind == OMPD_target_parallel_loop; } +bool clang::isOpenMPExecutableDirective(OpenMPDirectiveKind DKind) { + if (DKind == OMPD_error) + return true; + Category Cat = getDirectiveCategory(DKind); + return Cat == Category::Executable || Cat == Category::Subsidiary; +} + void clang::getOpenMPCaptureRegions( SmallVectorImpl &CaptureRegions, OpenMPDirectiveKind DKind) { diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index f0af1a3e3a38b5..533a9fe88a2150 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Allocator.h" @@ -46,6 +47,13 @@ using namespace clang; using namespace SrcMgr; using llvm::MemoryBuffer; +#define DEBUG_TYPE "source-manager" + +// Reaching a limit of 2^31 results in a hard 
error. This metric allows to track +// if particular invocation of the compiler is close to it. +STATISTIC(MaxUsedSLocBytes, "Maximum number of bytes used by source locations " + "(both loaded and local)."); + //===----------------------------------------------------------------------===// // SourceManager Helper Classes //===----------------------------------------------------------------------===// @@ -466,6 +474,7 @@ SourceManager::AllocateLoadedSLocEntries(unsigned NumSLocEntries, SLocEntryLoaded.resize(LoadedSLocEntryTable.size()); SLocEntryOffsetLoaded.resize(LoadedSLocEntryTable.size()); CurrentLoadedOffset -= TotalSize; + updateSlocUsageStats(); int BaseID = -int(LoadedSLocEntryTable.size()) - 1; LoadedSLocEntryAllocBegin.push_back(FileID::get(BaseID)); return std::make_pair(BaseID, CurrentLoadedOffset); @@ -619,6 +628,7 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, // We do a +1 here because we want a SourceLocation that means "the end of the // file", e.g. for the "no newline at the end of the file" diagnostic. NextLocalOffset += FileSize + 1; + updateSlocUsageStats(); // Set LastFileIDLookup to the newly created file. The next getFileID call is // almost guaranteed to be from that file. @@ -679,6 +689,7 @@ SourceManager::createExpansionLocImpl(const ExpansionInfo &Info, } // See createFileID for that +1. NextLocalOffset += Length + 1; + updateSlocUsageStats(); return SourceLocation::getMacroLoc(NextLocalOffset - (Length + 1)); } @@ -1843,6 +1854,12 @@ void SourceManager::associateFileChunkWithMacroArgExp( MacroArgsCache[EndOffs] = EndOffsMappedLoc; } +void SourceManager::updateSlocUsageStats() const { + SourceLocation::UIntTy UsedBytes = + NextLocalOffset + (MaxLoadedOffset - CurrentLoadedOffset); + MaxUsedSLocBytes.updateMax(UsedBytes); +} + /// If \arg Loc points inside a function macro argument, the returned /// location will be the macro location in which the argument was expanded. /// If a macro argument is used multiple times, the expanded location will diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 31d8121b91d107..2692ddec26ff46 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1536,6 +1536,7 @@ WindowsARM64TargetInfo::checkCallingConvention(CallingConv CC) const { case CC_OpenCLKernel: case CC_PreserveMost: case CC_PreserveAll: + case CC_PreserveNone: case CC_Swift: case CC_SwiftAsync: case CC_Win64: diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index ff7d2f1f92aa41..43b653dc52ce0d 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -59,7 +59,7 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, // Define available target features // These must be defined in sorted order! NoAsmVariants = true; - GPU = CudaArch::UNUSED; + GPU = OffloadArch::UNUSED; // PTX supports f16 as a fundamental type. HasLegalHalfType = true; @@ -175,116 +175,117 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__NVPTX__"); // Skip setting architecture dependent macros if undefined. - if (GPU == CudaArch::UNUSED && !HostTarget) + if (GPU == OffloadArch::UNUSED && !HostTarget) return; if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) { // Set __CUDA_ARCH__ for the GPU specified. 
std::string CUDAArchCode = [this] { switch (GPU) { - case CudaArch::GFX600: - case CudaArch::GFX601: - case CudaArch::GFX602: - case CudaArch::GFX700: - case CudaArch::GFX701: - case CudaArch::GFX702: - case CudaArch::GFX703: - case CudaArch::GFX704: - case CudaArch::GFX705: - case CudaArch::GFX801: - case CudaArch::GFX802: - case CudaArch::GFX803: - case CudaArch::GFX805: - case CudaArch::GFX810: - case CudaArch::GFX9_GENERIC: - case CudaArch::GFX900: - case CudaArch::GFX902: - case CudaArch::GFX904: - case CudaArch::GFX906: - case CudaArch::GFX908: - case CudaArch::GFX909: - case CudaArch::GFX90a: - case CudaArch::GFX90c: - case CudaArch::GFX940: - case CudaArch::GFX941: - case CudaArch::GFX942: - case CudaArch::GFX10_1_GENERIC: - case CudaArch::GFX1010: - case CudaArch::GFX1011: - case CudaArch::GFX1012: - case CudaArch::GFX1013: - case CudaArch::GFX10_3_GENERIC: - case CudaArch::GFX1030: - case CudaArch::GFX1031: - case CudaArch::GFX1032: - case CudaArch::GFX1033: - case CudaArch::GFX1034: - case CudaArch::GFX1035: - case CudaArch::GFX1036: - case CudaArch::GFX11_GENERIC: - case CudaArch::GFX1100: - case CudaArch::GFX1101: - case CudaArch::GFX1102: - case CudaArch::GFX1103: - case CudaArch::GFX1150: - case CudaArch::GFX1151: - case CudaArch::GFX1152: - case CudaArch::GFX12_GENERIC: - case CudaArch::GFX1200: - case CudaArch::GFX1201: - case CudaArch::Generic: - case CudaArch::LAST: + case OffloadArch::GFX600: + case OffloadArch::GFX601: + case OffloadArch::GFX602: + case OffloadArch::GFX700: + case OffloadArch::GFX701: + case OffloadArch::GFX702: + case OffloadArch::GFX703: + case OffloadArch::GFX704: + case OffloadArch::GFX705: + case OffloadArch::GFX801: + case OffloadArch::GFX802: + case OffloadArch::GFX803: + case OffloadArch::GFX805: + case OffloadArch::GFX810: + case OffloadArch::GFX9_GENERIC: + case OffloadArch::GFX900: + case OffloadArch::GFX902: + case OffloadArch::GFX904: + case OffloadArch::GFX906: + case OffloadArch::GFX908: + case OffloadArch::GFX909: + case OffloadArch::GFX90a: + case OffloadArch::GFX90c: + case OffloadArch::GFX940: + case OffloadArch::GFX941: + case OffloadArch::GFX942: + case OffloadArch::GFX10_1_GENERIC: + case OffloadArch::GFX1010: + case OffloadArch::GFX1011: + case OffloadArch::GFX1012: + case OffloadArch::GFX1013: + case OffloadArch::GFX10_3_GENERIC: + case OffloadArch::GFX1030: + case OffloadArch::GFX1031: + case OffloadArch::GFX1032: + case OffloadArch::GFX1033: + case OffloadArch::GFX1034: + case OffloadArch::GFX1035: + case OffloadArch::GFX1036: + case OffloadArch::GFX11_GENERIC: + case OffloadArch::GFX1100: + case OffloadArch::GFX1101: + case OffloadArch::GFX1102: + case OffloadArch::GFX1103: + case OffloadArch::GFX1150: + case OffloadArch::GFX1151: + case OffloadArch::GFX1152: + case OffloadArch::GFX12_GENERIC: + case OffloadArch::GFX1200: + case OffloadArch::GFX1201: + case OffloadArch::AMDGCNSPIRV: + case OffloadArch::Generic: + case OffloadArch::LAST: break; - case CudaArch::UNKNOWN: + case OffloadArch::UNKNOWN: assert(false && "No GPU arch when compiling CUDA device code."); return ""; - case CudaArch::UNUSED: - case CudaArch::SM_20: + case OffloadArch::UNUSED: + case OffloadArch::SM_20: return "200"; - case CudaArch::SM_21: + case OffloadArch::SM_21: return "210"; - case CudaArch::SM_30: + case OffloadArch::SM_30: return "300"; - case CudaArch::SM_32_: + case OffloadArch::SM_32_: return "320"; - case CudaArch::SM_35: + case OffloadArch::SM_35: return "350"; - case CudaArch::SM_37: + case OffloadArch::SM_37: return "370"; - case 
CudaArch::SM_50: + case OffloadArch::SM_50: return "500"; - case CudaArch::SM_52: + case OffloadArch::SM_52: return "520"; - case CudaArch::SM_53: + case OffloadArch::SM_53: return "530"; - case CudaArch::SM_60: + case OffloadArch::SM_60: return "600"; - case CudaArch::SM_61: + case OffloadArch::SM_61: return "610"; - case CudaArch::SM_62: + case OffloadArch::SM_62: return "620"; - case CudaArch::SM_70: + case OffloadArch::SM_70: return "700"; - case CudaArch::SM_72: + case OffloadArch::SM_72: return "720"; - case CudaArch::SM_75: + case OffloadArch::SM_75: return "750"; - case CudaArch::SM_80: + case OffloadArch::SM_80: return "800"; - case CudaArch::SM_86: + case OffloadArch::SM_86: return "860"; - case CudaArch::SM_87: + case OffloadArch::SM_87: return "870"; - case CudaArch::SM_89: + case OffloadArch::SM_89: return "890"; - case CudaArch::SM_90: - case CudaArch::SM_90a: + case OffloadArch::SM_90: + case OffloadArch::SM_90a: return "900"; } - llvm_unreachable("unhandled CudaArch"); + llvm_unreachable("unhandled OffloadArch"); }(); Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); - if (GPU == CudaArch::SM_90a) + if (GPU == OffloadArch::SM_90a) Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); } } diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index f476d49047c013..a5daf36cfac720 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -62,7 +62,7 @@ static const int NVPTXDWARFAddrSpaceMap[] = { class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { static const char *const GCCRegNames[]; - CudaArch GPU; + OffloadArch GPU; uint32_t PTXVersion; std::unique_ptr HostTarget; @@ -79,8 +79,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { initFeatureMap(llvm::StringMap &Features, DiagnosticsEngine &Diags, StringRef CPU, const std::vector &FeaturesVec) const override { - if (GPU != CudaArch::UNUSED) - Features[CudaArchToString(GPU)] = true; + if (GPU != OffloadArch::UNUSED) + Features[OffloadArchToString(GPU)] = true; Features["ptx" + std::to_string(PTXVersion)] = true; return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } @@ -121,18 +121,18 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { } bool isValidCPUName(StringRef Name) const override { - return StringToCudaArch(Name) != CudaArch::UNKNOWN; + return StringToOffloadArch(Name) != OffloadArch::UNKNOWN; } void fillValidCPUList(SmallVectorImpl &Values) const override { - for (int i = static_cast(CudaArch::SM_20); - i < static_cast(CudaArch::Generic); ++i) - Values.emplace_back(CudaArchToString(static_cast(i))); + for (int i = static_cast(OffloadArch::SM_20); + i < static_cast(OffloadArch::Generic); ++i) + Values.emplace_back(OffloadArchToString(static_cast(i))); } bool setCPU(const std::string &Name) override { - GPU = StringToCudaArch(Name); - return GPU != CudaArch::UNKNOWN; + GPU = StringToOffloadArch(Name); + return GPU != OffloadArch::UNKNOWN; } void setSupportedOpenCLOpts() override { @@ -183,7 +183,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { bool hasBitIntType() const override { return true; } bool hasBFloat16Type() const override { return true; } - CudaArch getGPU() const { return GPU; } + OffloadArch getGPU() const { return GPU; } }; } // namespace targets } // namespace clang diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index fc23c30c68523a..e4d6a02386da58 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h 
@@ -305,9 +305,11 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { // asm statements) Info.setAllowsMemory(); break; - case 'R': // AIX TOC entry case 'a': // Address operand that is an indexed or indirect from a // register (`p' is preferable for asm statements) + // TODO: Add full support for this constraint + return false; + case 'R': // AIX TOC entry case 'S': // Constant suitable as a 64-bit mask operand case 'T': // Constant suitable as a 32-bit mask operand case 'U': // System V Release 4 small data area reference diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 036a655a4d073f..276d492955207a 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -455,6 +455,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasNF = true; } else if (Feature == "+cf") { HasCF = true; + } else if (Feature == "+zu") { + HasZU = true; } X86SSEEnum Level = llvm::StringSwitch(Feature) @@ -962,8 +964,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__NF__"); if (HasCF) Builder.defineMacro("__CF__"); + if (HasZU) + Builder.defineMacro("__ZU__"); // Condition here is aligned with the feature set of mapxf in Options.td - if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD && HasCCMP && HasNF) + if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD && HasCCMP && HasNF && HasCF) Builder.defineMacro("__APX_F__"); if (HasEGPR && HasInlineAsmUseGPR32) Builder.defineMacro("__APX_INLINE_ASM_USE_GPR32__"); @@ -1166,6 +1170,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("ccmp", true) .Case("nf", true) .Case("cf", true) + .Case("zu", true) .Default(false); } @@ -1286,6 +1291,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("ccmp", HasCCMP) .Case("nf", HasNF) .Case("cf", HasCF) + .Case("zu", HasZU) .Default(false); } diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 9b2ae87adb2e74..5ce4953251bc34 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -172,6 +172,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasCCMP = false; bool HasNF = false; bool HasCF = false; + bool HasZU = false; bool HasInlineAsmUseGPR32 = false; protected: diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h index 0fe9929dca2b33..a648bd314e5012 100644 --- a/clang/lib/CodeGen/BackendConsumer.h +++ b/clang/lib/CodeGen/BackendConsumer.h @@ -112,7 +112,7 @@ class BackendConsumer : public ASTConsumer { void HandleVTable(CXXRecordDecl *RD) override; // Links each entry in LinkModules into our module. Returns true on error. - bool LinkInModules(llvm::Module *M, bool ShouldLinkFiles = true); + bool LinkInModules(llvm::Module *M); /// Get the best possible source location to represent a diagnostic that /// may have associated debug info. diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index b09680086248d0..22b593e8f2b7a2 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1035,7 +1035,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( // Link against bitcodes supplied via the -mlink-builtin-bitcode option if (CodeGenOpts.LinkBitcodePostopt) - MPM.addPass(LinkInModulesPass(BC, false)); + MPM.addPass(LinkInModulesPass(BC)); // Add a verifier pass if requested. 
We don't have to do this if the action // requires code generation because there will already be a verifier pass in diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h index ed07476f4047f6..0bc4fda62979cd 100644 --- a/clang/lib/CodeGen/CGBuilder.h +++ b/clang/lib/CodeGen/CGBuilder.h @@ -62,7 +62,7 @@ class CGBuilderTy : public CGBuilderBaseTy { template Address createConstGEP2_32(Address Addr, unsigned Idx0, unsigned Idx1, const llvm::Twine &Name) { - const llvm::DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + const llvm::DataLayout &DL = BB->getDataLayout(); llvm::GetElementPtrInst *GEP; if (IsInBounds) GEP = cast(CreateConstInBoundsGEP2_32( @@ -218,7 +218,7 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreateStructGEP(Address Addr, unsigned Index, const llvm::Twine &Name = "") { llvm::StructType *ElTy = cast(Addr.getElementType()); - const llvm::DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + const llvm::DataLayout &DL = BB->getDataLayout(); const llvm::StructLayout *Layout = DL.getStructLayout(ElTy); auto Offset = CharUnits::fromQuantity(Layout->getElementOffset(Index)); @@ -240,7 +240,7 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name = "") { llvm::ArrayType *ElTy = cast(Addr.getElementType()); - const llvm::DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + const llvm::DataLayout &DL = BB->getDataLayout(); CharUnits EltSize = CharUnits::fromQuantity(DL.getTypeAllocSize(ElTy->getElementType())); @@ -260,7 +260,7 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name = "") { llvm::Type *ElTy = Addr.getElementType(); - const llvm::DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + const llvm::DataLayout &DL = BB->getDataLayout(); CharUnits EltSize = CharUnits::fromQuantity(DL.getTypeAllocSize(ElTy)); return Address( @@ -277,7 +277,7 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name = "") { llvm::Type *ElTy = Addr.getElementType(); - const llvm::DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + const llvm::DataLayout &DL = BB->getDataLayout(); CharUnits EltSize = CharUnits::fromQuantity(DL.getTypeAllocSize(ElTy)); return Address(CreateGEP(ElTy, Addr.getBasePointer(), getSize(Index), Name), @@ -290,7 +290,7 @@ class CGBuilderTy : public CGBuilderBaseTy { using CGBuilderBaseTy::CreateGEP; Address CreateGEP(CodeGenFunction &CGF, Address Addr, llvm::Value *Index, const llvm::Twine &Name = "") { - const llvm::DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + const llvm::DataLayout &DL = BB->getDataLayout(); CharUnits EltSize = CharUnits::fromQuantity(DL.getTypeAllocSize(Addr.getElementType())); @@ -412,7 +412,7 @@ class CGBuilderTy : public CGBuilderBaseTy { unsigned FieldIndex, llvm::MDNode *DbgInfo) { llvm::StructType *ElTy = cast(Addr.getElementType()); - const llvm::DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + const llvm::DataLayout &DL = BB->getDataLayout(); const llvm::StructLayout *Layout = DL.getStructLayout(ElTy); auto Offset = CharUnits::fromQuantity(Layout->getElementOffset(Index)); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 6316e2b062692d..98c2f70664ec7a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3505,6 +3505,18 @@ 
RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_trap: EmitTrapCall(Intrinsic::trap); return RValue::get(nullptr); + case Builtin::BI__builtin_verbose_trap: { + llvm::DILocation *TrapLocation = Builder.getCurrentDebugLocation(); + if (getDebugInfo()) { + TrapLocation = getDebugInfo()->CreateTrapFailureMessageFor( + TrapLocation, *E->getArg(0)->tryEvaluateString(getContext()), + *E->getArg(1)->tryEvaluateString(getContext())); + } + ApplyDebugLocation ApplyTrapDI(*this, TrapLocation); + // Currently no attempt is made to prevent traps from being merged. + EmitTrapCall(Intrinsic::trap); + return RValue::get(nullptr); + } case Builtin::BI__debugbreak: EmitTrapCall(Intrinsic::debugtrap); return RValue::get(nullptr); @@ -18467,6 +18479,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType()); return Builder.CreateCall(F, Args); } + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlanex16: + return emitBuiltinWithOneOverloadedType<6>( + *this, E, + BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16 + ? Intrinsic::amdgcn_permlane16 + : Intrinsic::amdgcn_permlanex16); + case AMDGPU::BI__builtin_amdgcn_permlane64: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_permlane64); + case AMDGPU::BI__builtin_amdgcn_readlane: + return emitBuiltinWithOneOverloadedType<2>(*this, E, + Intrinsic::amdgcn_readlane); + case AMDGPU::BI__builtin_amdgcn_readfirstlane: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_readfirstlane); case AMDGPU::BI__builtin_amdgcn_div_fixup: case AMDGPU::BI__builtin_amdgcn_div_fixupf: case AMDGPU::BI__builtin_amdgcn_div_fixuph: @@ -18604,28 +18632,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() }); return Builder.CreateCall(F, { Src0, Builder.getFalse() }); } - case AMDGPU::BI__builtin_amdgcn_ds_fminf: - case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: { - Intrinsic::ID Intrin; - switch (BuiltinID) { - case AMDGPU::BI__builtin_amdgcn_ds_fminf: - Intrin = Intrinsic::amdgcn_ds_fmin; - break; - case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: - Intrin = Intrinsic::amdgcn_ds_fmax; - break; - } - llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); - llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); - llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); - llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); - llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); - llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() }); - llvm::FunctionType *FTy = F->getFunctionType(); - llvm::Type *PTy = FTy->getParamType(0); - Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy); - return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 }); - } case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: @@ -19059,11 +19065,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_atomic_inc64: case AMDGPU::BI__builtin_amdgcn_atomic_dec32: case AMDGPU::BI__builtin_amdgcn_atomic_dec64: - case AMDGPU::BI__builtin_amdgcn_ds_faddf: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: - case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: { + case 
AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: { llvm::AtomicRMWInst::BinOp BinOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_atomic_inc32: @@ -19081,6 +19089,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: BinOp = llvm::AtomicRMWInst::FAdd; break; + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + BinOp = llvm::AtomicRMWInst::FMin; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: + BinOp = llvm::AtomicRMWInst::FMax; + break; } Address Ptr = CheckAtomicAlignment(*this, E); @@ -19090,8 +19104,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, bool Volatile; - if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) { - // __builtin_amdgcn_ds_faddf has an explicit volatile argument + if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf || + BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf || + BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) { + // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument Volatile = cast(EmitScalarExpr(E->getArg(4)))->getZExtValue(); } else { @@ -19105,7 +19121,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3)), AO, SSID); } else { - // The ds_fadd_* builtins do not have syncscope/order arguments. + // The ds_atomic_fadd_* builtins do not have syncscope/order arguments. SSID = llvm::SyncScope::System; AO = AtomicOrdering::SequentiallyConsistent; @@ -19135,6 +19151,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: return emitBuiltinWithOneOverloadedType<4>( *this, E, Intrinsic::amdgcn_make_buffer_rsrc); + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128: + return emitBuiltinWithOneOverloadedType<5>( + *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store); default: return nullptr; } diff --git a/clang/lib/CodeGen/CGCXX.cpp b/clang/lib/CodeGen/CGCXX.cpp index e95a735f92f74b..23ebbee19bf791 100644 --- a/clang/lib/CodeGen/CGCXX.cpp +++ b/clang/lib/CodeGen/CGCXX.cpp @@ -263,7 +263,16 @@ static CGCallee BuildAppleKextVirtualCall(CodeGenFunction &CGF, CGF.Builder.CreateConstInBoundsGEP1_64(Ty, VTable, VTableIndex, "vfnkxt"); llvm::Value *VFunc = CGF.Builder.CreateAlignedLoad( Ty, VFuncPtr, llvm::Align(CGF.PointerAlignInBytes)); - CGCallee Callee(GD, VFunc); + + CGPointerAuthInfo PointerAuth; + if (auto &Schema = + CGM.getCodeGenOpts().PointerAuth.CXXVirtualFunctionPointers) { + GlobalDecl OrigMD = + CGM.getItaniumVTableContext().findOriginalMethod(GD.getCanonicalDecl()); + PointerAuth = CGF.EmitPointerAuthInfo(Schema, VFuncPtr, OrigMD, QualType()); + } + + CGCallee Callee(GD, VFunc, PointerAuth); return Callee; } diff --git a/clang/lib/CodeGen/CGCXXABI.h b/clang/lib/CodeGen/CGCXXABI.h index 104a20db8efaf2..7dcc539111996b 100644 --- a/clang/lib/CodeGen/CGCXXABI.h +++ b/clang/lib/CodeGen/CGCXXABI.h @@ -504,13 +504,15 @@ class CGCXXABI { virtual void setThunkLinkage(llvm::Function *Thunk, bool ForVTable, GlobalDecl GD, bool ReturnAdjustment) = 0; - virtual llvm::Value 
*performThisAdjustment(CodeGenFunction &CGF, - Address This, - const ThisAdjustment &TA) = 0; + virtual llvm::Value * + performThisAdjustment(CodeGenFunction &CGF, Address This, + const CXXRecordDecl *UnadjustedClass, + const ThunkInfo &TI) = 0; - virtual llvm::Value *performReturnAdjustment(CodeGenFunction &CGF, - Address Ret, - const ReturnAdjustment &RA) = 0; + virtual llvm::Value * + performReturnAdjustment(CodeGenFunction &CGF, Address Ret, + const CXXRecordDecl *UnadjustedClass, + const ReturnAdjustment &RA) = 0; virtual void EmitReturnFromThunk(CodeGenFunction &CGF, RValue RV, QualType ResultType); diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 5a032bdbf93791..0a595bb998d261 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2588,6 +2588,11 @@ void CodeGenFunction::InitializeVTablePointer(const VPtr &Vptr) { // the same addr space. Note that this might not be LLVM address space 0. VTableField = VTableField.withElementType(PtrTy); + if (auto AuthenticationInfo = CGM.getVTablePointerAuthInfo( + this, Vptr.Base.getBase(), VTableField.emitRawPointer(*this))) + VTableAddressPoint = + EmitPointerAuthSign(*AuthenticationInfo, VTableAddressPoint); + llvm::StoreInst *Store = Builder.CreateStore(VTableAddressPoint, VTableField); TBAAAccessInfo TBAAInfo = CGM.getTBAAVTablePtrAccessInfo(PtrTy); CGM.DecorateInstructionWithTBAA(Store, TBAAInfo); @@ -2681,12 +2686,35 @@ void CodeGenFunction::InitializeVTablePointers(const CXXRecordDecl *RD) { llvm::Value *CodeGenFunction::GetVTablePtr(Address This, llvm::Type *VTableTy, - const CXXRecordDecl *RD) { + const CXXRecordDecl *RD, + VTableAuthMode AuthMode) { Address VTablePtrSrc = This.withElementType(VTableTy); llvm::Instruction *VTable = Builder.CreateLoad(VTablePtrSrc, "vtable"); TBAAAccessInfo TBAAInfo = CGM.getTBAAVTablePtrAccessInfo(VTableTy); CGM.DecorateInstructionWithTBAA(VTable, TBAAInfo); + if (auto AuthenticationInfo = + CGM.getVTablePointerAuthInfo(this, RD, This.emitRawPointer(*this))) { + if (AuthMode != VTableAuthMode::UnsafeUbsanStrip) { + VTable = cast( + EmitPointerAuthAuth(*AuthenticationInfo, VTable)); + if (AuthMode == VTableAuthMode::MustTrap) { + // This is clearly suboptimal but until we have an ability + // to rely on the authentication intrinsic trapping and force + // an authentication to occur we don't really have a choice. 
+ VTable = + cast(Builder.CreateBitCast(VTable, Int8PtrTy)); + Builder.CreateLoad(RawAddress(VTable, Int8Ty, CGM.getPointerAlign()), + /* IsVolatile */ true); + } + } else { + VTable = cast(EmitPointerAuthAuth( + CGPointerAuthInfo(0, PointerAuthenticationMode::Strip, false, false, + nullptr), + VTable)); + } + } + if (CGM.getCodeGenOpts().OptimizationLevel > 0 && CGM.getCodeGenOpts().StrictVTablePointers) CGM.DecorateInstructionWithInvariantGroup(VTable, RD); diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index b4c724422c14aa..a8a70186c2c5a1 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -103,7 +103,7 @@ static void createCoroData(CodeGenFunction &CGF, return; } - CurCoro.Data = std::unique_ptr(new CGCoroData); + CurCoro.Data = std::make_unique(); CurCoro.Data->CoroId = CoroId; CurCoro.Data->CoroIdExpr = CoroIdExpr; } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index a072475ba77057..3d8a715b692de8 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -32,6 +32,7 @@ #include "clang/Basic/FileManager.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Version.h" +#include "clang/CodeGen/ModuleBuilder.h" #include "clang/Frontend/FrontendOptions.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/ModuleMap.h" @@ -1731,6 +1732,28 @@ llvm::DIType *CGDebugInfo::createFieldType( offsetInBits, flags, debugType, Annotations); } +llvm::DISubprogram * +CGDebugInfo::createInlinedTrapSubprogram(StringRef FuncName, + llvm::DIFile *FileScope) { + // We are caching the subprogram because we don't want to duplicate + // subprograms with the same message. Note that `SPFlagDefinition` prevents + // subprograms from being uniqued. + llvm::DISubprogram *&SP = InlinedTrapFuncMap[FuncName]; + + if (!SP) { + llvm::DISubroutineType *DIFnTy = DBuilder.createSubroutineType(nullptr); + SP = DBuilder.createFunction( + /*Scope=*/FileScope, /*Name=*/FuncName, /*LinkageName=*/StringRef(), + /*File=*/FileScope, /*LineNo=*/0, /*Ty=*/DIFnTy, + /*ScopeLine=*/0, + /*Flags=*/llvm::DINode::FlagArtificial, + /*SPFlags=*/llvm::DISubprogram::SPFlagDefinition, + /*TParams=*/nullptr, /*ThrownTypes=*/nullptr, /*Annotations=*/nullptr); + } + + return SP; +} + void CGDebugInfo::CollectRecordLambdaFields( const CXXRecordDecl *CXXDecl, SmallVectorImpl &elements, llvm::DIType *RecordTy) { @@ -3527,6 +3550,23 @@ llvm::DIMacroFile *CGDebugInfo::CreateTempMacroFile(llvm::DIMacroFile *Parent, return DBuilder.createTempMacroFile(Parent, Line, FName); } +llvm::DILocation *CGDebugInfo::CreateTrapFailureMessageFor( + llvm::DebugLoc TrapLocation, StringRef Category, StringRef FailureMsg) { + // Create a debug location from `TrapLocation` that adds an artificial inline + // frame. 
+ SmallString<64> FuncName(ClangTrapPrefix); + + FuncName += "$"; + FuncName += Category; + FuncName += "$"; + FuncName += FailureMsg; + + llvm::DISubprogram *TrapSP = + createInlinedTrapSubprogram(FuncName, TrapLocation->getFile()); + return llvm::DILocation::get(CGM.getLLVMContext(), /*Line=*/0, /*Column=*/0, + /*Scope=*/TrapSP, /*InlinedAt=*/TrapLocation); +} + static QualType UnwrapTypeForDebugInfo(QualType T, const ASTContext &C) { Qualifiers Quals; do { diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index 2731c627d9dc37..a0c419cf1e208a 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -29,7 +29,9 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Allocator.h" +#include #include +#include namespace llvm { class MDNode; @@ -346,6 +348,14 @@ class CGDebugInfo { const FieldDecl *BitFieldDecl, const llvm::DIDerivedType *BitFieldDI, llvm::ArrayRef PreviousFieldsDI, const RecordDecl *RD); + /// A cache that maps names of artificial inlined functions to subprograms. + llvm::StringMap InlinedTrapFuncMap; + + /// A function that returns the subprogram corresponding to the artificial + /// inlined function for traps. + llvm::DISubprogram *createInlinedTrapSubprogram(StringRef FuncName, + llvm::DIFile *FileScope); + /// Helpers for collecting fields of a record. /// @{ void CollectRecordLambdaFields(const CXXRecordDecl *CXXDecl, @@ -608,6 +618,18 @@ class CGDebugInfo { return CoroutineParameterMappings; } + /// Create a debug location from `TrapLocation` that adds an artificial inline + /// frame where the frame name is + /// + /// * `::` + /// + /// `` is "__clang_trap_msg". + /// + /// This is used to store failure reasons for traps. + llvm::DILocation *CreateTrapFailureMessageFor(llvm::DebugLoc TrapLocation, + StringRef Category, + StringRef FailureMsg); + private: /// Emit call to llvm.dbg.declare for a variable declaration. /// Returns a pointer to the DILocalVariable associated with the diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 4a213990d1e36e..90aa4c0745a8ab 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1972,7 +1972,21 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) { } if (!constant) { - initializeWhatIsTechnicallyUninitialized(Loc); + if (trivialAutoVarInit != + LangOptions::TrivialAutoVarInitKind::Uninitialized) { + // At this point, we know D has an Init expression, but isn't a constant. + // - If D is not a scalar, auto-var-init conservatively (members may be + // left uninitialized by constructor Init expressions for example). + // - If D is a scalar, we only need to auto-var-init if there is a + // self-reference. Otherwise, the Init expression should be sufficient. + // It may be that the Init expression uses other uninitialized memory, + // but auto-var-init here would not help, as auto-init would get + // overwritten by Init. 
+ if (!D.getType()->isScalarType() || capturedByInit || + isAccessedBy(D, Init)) { + initializeWhatIsTechnicallyUninitialized(Loc); + } + } LValue lv = MakeAddrLValue(Loc, type); lv.setNonGC(true); return EmitExprAsInit(Init, &D, lv, capturedByInit); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 534f46da748627..23e5deee325813 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -830,8 +830,14 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, // Load the vptr, and mix it with TypeHash. llvm::Value *TypeHash = llvm::ConstantInt::get(Int64Ty, xxh3_64bits(Out.str())); + + llvm::Type *VPtrTy = llvm::PointerType::get(IntPtrTy, 0); Address VPtrAddr(Ptr, IntPtrTy, getPointerAlign()); - llvm::Value *VPtrVal = Builder.CreateLoad(VPtrAddr); + llvm::Value *VPtrVal = GetVTablePtr(VPtrAddr, VPtrTy, + Ty->getAsCXXRecordDecl(), + VTableAuthMode::UnsafeUbsanStrip); + VPtrVal = Builder.CreateBitOrPointerCast(VPtrVal, IntPtrTy); + llvm::Value *Hash = emitHashMix(Builder, TypeHash, Builder.CreateZExt(VPtrVal, Int64Ty)); Hash = Builder.CreateTrunc(Hash, IntPtrTy); diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index dffb8ce83b6437..1fec587b5c4c7c 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -803,6 +803,13 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD, llvm::Constant *VTableAddressPoint = CGM.getCXXABI().getVTableAddressPoint(BaseSubobject(CD, Offset), VTableClass); + if (auto Authentication = + CGM.getVTablePointerAuthentication(VTableClass)) { + VTableAddressPoint = Emitter.tryEmitConstantSignedPointer( + VTableAddressPoint, *Authentication); + if (!VTableAddressPoint) + return false; + } if (!AppendBytes(Offset, VTableAddressPoint)) return false; } @@ -1647,7 +1654,7 @@ namespace { // messing around with llvm::Constant structures, which never itself // does anything that should be visible in compiler output. for (auto &entry : Locations) { - assert(entry.first->getParent() == nullptr && "not a placeholder!"); + assert(entry.first->getName() == "" && "not a placeholder!"); entry.first->replaceAllUsesWith(entry.second); entry.first->eraseFromParent(); } @@ -1811,6 +1818,43 @@ llvm::Constant *ConstantEmitter::tryEmitPrivateForMemory(const APValue &value, return (C ? emitForMemory(C, destType) : nullptr); } +/// Try to emit a constant signed pointer, given a raw pointer and the +/// destination ptrauth qualifier. +/// +/// This can fail if the qualifier needs address discrimination and the +/// emitter is in an abstract mode. +llvm::Constant * +ConstantEmitter::tryEmitConstantSignedPointer(llvm::Constant *UnsignedPointer, + PointerAuthQualifier Schema) { + assert(Schema && "applying trivial ptrauth schema"); + + if (Schema.hasKeyNone()) + return UnsignedPointer; + + unsigned Key = Schema.getKey(); + + // Create an address placeholder if we're using address discrimination. + llvm::GlobalValue *StorageAddress = nullptr; + if (Schema.isAddressDiscriminated()) { + // We can't do this if the emitter is in an abstract state. 
+ if (isAbstract()) + return nullptr; + + StorageAddress = getCurrentAddrPrivate(); + } + + llvm::ConstantInt *Discriminator = + llvm::ConstantInt::get(CGM.IntPtrTy, Schema.getExtraDiscriminator()); + + llvm::Constant *SignedPointer = CGM.getConstantSignedPointer( + UnsignedPointer, Key, StorageAddress, Discriminator); + + if (Schema.isAddressDiscriminated()) + registerCurrentAddrPrivate(SignedPointer, StorageAddress); + + return SignedPointer; +} + llvm::Constant *ConstantEmitter::emitForMemory(CodeGenModule &CGM, llvm::Constant *C, QualType destType) { @@ -2041,10 +2085,10 @@ ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) { return ConstantLValue(C); }; - if (auto FD = dyn_cast(D)) + if (const auto *FD = dyn_cast(D)) return PtrAuthSign(CGM.getRawFunctionPointer(FD)); - if (auto VD = dyn_cast(D)) { + if (const auto *VD = dyn_cast(D)) { // We can never refer to a variable with local storage. if (!VD->hasLocalStorage()) { if (VD->isFileVarDecl() || VD->hasExternalStorage()) @@ -2057,13 +2101,13 @@ ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) { } } - if (auto *GD = dyn_cast(D)) + if (const auto *GD = dyn_cast(D)) return CGM.GetAddrOfMSGuidDecl(GD); - if (auto *GCD = dyn_cast(D)) + if (const auto *GCD = dyn_cast(D)) return CGM.GetAddrOfUnnamedGlobalConstantDecl(GCD); - if (auto *TPO = dyn_cast(D)) + if (const auto *TPO = dyn_cast(D)) return CGM.GetAddrOfTemplateParamObject(TPO); return nullptr; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 6e9a1bacd9bf5f..f5bd4a141cc2d7 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -501,31 +501,6 @@ class CheckVarsEscapingDeclContext final }; } // anonymous namespace -/// Get the id of the warp in the block. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDBits = - llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size); - auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); - return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); -} - -/// Get the id of the current lane in the Warp. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDBits = - llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size); - assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device."); - unsigned LaneIDMask = ~0u >> (32u - LaneIDBits); - auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); - return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), - "nvptx_lane_id"); -} - CGOpenMPRuntimeGPU::ExecutionMode CGOpenMPRuntimeGPU::getExecutionMode() const { return CurrentExecutionMode; @@ -1435,1133 +1410,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, TBAAAccessInfo()); } -/// This function creates calls to one of two shuffle functions to copy -/// variables between lanes in a warp. 
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF, - llvm::Value *Elem, - QualType ElemType, - llvm::Value *Offset, - SourceLocation Loc) { - CodeGenModule &CGM = CGF.CGM; - CGBuilderTy &Bld = CGF.Builder; - CGOpenMPRuntimeGPU &RT = - *(static_cast(&CGM.getOpenMPRuntime())); - llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder(); - - CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); - assert(Size.getQuantity() <= 8 && - "Unsupported bitwidth in shuffle instruction."); - - RuntimeFunction ShuffleFn = Size.getQuantity() <= 4 - ? OMPRTL___kmpc_shuffle_int32 - : OMPRTL___kmpc_shuffle_int64; - - // Cast all types to 32- or 64-bit values before calling shuffle routines. - QualType CastTy = CGF.getContext().getIntTypeForBitwidth( - Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1); - llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc); - llvm::Value *WarpSize = - Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true); - - llvm::Value *ShuffledVal = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn), - {ElemCast, Offset, WarpSize}); - - return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc); -} - -static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, - Address DestAddr, QualType ElemType, - llvm::Value *Offset, SourceLocation Loc) { - CGBuilderTy &Bld = CGF.Builder; - - CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); - // Create the loop over the big sized data. - // ptr = (void*)Elem; - // ptrEnd = (void*) Elem + 1; - // Step = 8; - // while (ptr + Step < ptrEnd) - // shuffle((int64_t)*ptr); - // Step = 4; - // while (ptr + Step < ptrEnd) - // shuffle((int32_t)*ptr); - // ... - Address ElemPtr = DestAddr; - Address Ptr = SrcAddr; - Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast( - Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty); - for (int IntSize = 8; IntSize >= 1; IntSize /= 2) { - if (Size < CharUnits::fromQuantity(IntSize)) - continue; - QualType IntType = CGF.getContext().getIntTypeForBitwidth( - CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)), - /*Signed=*/1); - llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType); - Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(), - IntTy); - ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtr, IntTy->getPointerTo(), IntTy); - if (Size.getQuantity() / IntSize > 1) { - llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond"); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit"); - llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock(); - CGF.EmitBlock(PreCondBB); - llvm::PHINode *PhiSrc = - Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2); - PhiSrc->addIncoming(Ptr.emitRawPointer(CGF), CurrentBB); - llvm::PHINode *PhiDest = - Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2); - PhiDest->addIncoming(ElemPtr.emitRawPointer(CGF), CurrentBB); - Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment()); - ElemPtr = - Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment()); - llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF); - llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF); - llvm::Value *PtrDiff = Bld.CreatePtrDiff( - CGF.Int8Ty, PtrEndRaw, - Bld.CreatePointerBitCastOrAddrSpaceCast(PtrRaw, CGF.VoidPtrTy)); - Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)), - ThenBB, ExitBB); - CGF.EmitBlock(ThenBB); - 
llvm::Value *Res = createRuntimeShuffleFunction( - CGF, - CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()), - IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - Address LocalPtr = Bld.CreateConstGEP(Ptr, 1); - Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1); - PhiSrc->addIncoming(LocalPtr.emitRawPointer(CGF), ThenBB); - PhiDest->addIncoming(LocalElemPtr.emitRawPointer(CGF), ThenBB); - CGF.EmitBranch(PreCondBB); - CGF.EmitBlock(ExitBB); - } else { - llvm::Value *Res = createRuntimeShuffleFunction( - CGF, - CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()), - IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - Ptr = Bld.CreateConstGEP(Ptr, 1); - ElemPtr = Bld.CreateConstGEP(ElemPtr, 1); - } - Size = Size % IntSize; - } -} - -namespace { -enum CopyAction : unsigned { - // RemoteLaneToThread: Copy over a Reduce list from a remote lane in - // the warp using shuffle instructions. - RemoteLaneToThread, - // ThreadCopy: Make a copy of a Reduce list on the thread's stack. - ThreadCopy, -}; -} // namespace - -struct CopyOptionsTy { - llvm::Value *RemoteLaneOffset; - llvm::Value *ScratchpadIndex; - llvm::Value *ScratchpadWidth; -}; - -/// Emit instructions to copy a Reduce list, which contains partially -/// aggregated values, in the specified direction. -static void emitReductionListCopy( - CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, - ArrayRef Privates, Address SrcBase, Address DestBase, - CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) { - - CodeGenModule &CGM = CGF.CGM; - ASTContext &C = CGM.getContext(); - CGBuilderTy &Bld = CGF.Builder; - - llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; - - // Iterates, element-by-element, through the source Reduce list and - // make a copy. - unsigned Idx = 0; - for (const Expr *Private : Privates) { - Address SrcElementAddr = Address::invalid(); - Address DestElementAddr = Address::invalid(); - Address DestElementPtrAddr = Address::invalid(); - // Should we shuffle in an element from a remote lane? - bool ShuffleInElement = false; - // Set to true to update the pointer in the dest Reduce list to a - // newly created element. - bool UpdateDestListPtr = false; - QualType PrivatePtrType = C.getPointerType(Private->getType()); - llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType); - - switch (Action) { - case RemoteLaneToThread: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), - PrivatePtrType->castAs()); - - // Step 1.2: Create a temporary to store the element in the destination - // Reduce list. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = - CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); - ShuffleInElement = true; - UpdateDestListPtr = true; - break; - } - case ThreadCopy: { - // Step 1.1: Get the address for the src element in the Reduce list. 
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), - PrivatePtrType->castAs()); - - // Step 1.2: Get the address for dest element. The destination - // element has already been created on the thread's stack. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = CGF.EmitLoadOfPointer( - DestElementPtrAddr.withElementType(PrivateLlvmPtrType), - PrivatePtrType->castAs()); - break; - } - } - - // Regardless of src and dest of copy, we emit the load of src - // element as this is required in all directions - SrcElementAddr = SrcElementAddr.withElementType( - CGF.ConvertTypeForMem(Private->getType())); - DestElementAddr = - DestElementAddr.withElementType(SrcElementAddr.getElementType()); - - // Now that all active lanes have read the element in the - // Reduce list, shuffle over the value from the remote lane. - if (ShuffleInElement) { - shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(), - RemoteLaneOffset, Private->getExprLoc()); - } else { - switch (CGF.getEvaluationKind(Private->getType())) { - case TEK_Scalar: { - llvm::Value *Elem = CGF.EmitLoadOfScalar( - SrcElementAddr, /*Volatile=*/false, Private->getType(), - Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar( - Elem, DestElementAddr, /*Volatile=*/false, Private->getType(), - LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); - break; - } - case TEK_Complex: { - CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( - CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), - Private->getExprLoc()); - CGF.EmitStoreOfComplex( - Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()), - /*isInit=*/false); - break; - } - case TEK_Aggregate: - CGF.EmitAggregateCopy( - CGF.MakeAddrLValue(DestElementAddr, Private->getType()), - CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - break; - } - } - - // Step 3.1: Modify reference in dest Reduce list as needed. - // Modifying the reference in Reduce list to point to the newly - // created element. The element is live in the current function - // scope and that of functions it invokes (i.e., reduce_function). - // RemoteReduceData[i] = (void*)&RemoteElem - if (UpdateDestListPtr) { - CGF.EmitStoreOfScalar( - Bld.CreatePointerBitCastOrAddrSpaceCast( - DestElementAddr.emitRawPointer(CGF), CGF.VoidPtrTy), - DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy); - } - - ++Idx; - } -} - -/// This function emits a helper that gathers Reduce lists from the first -/// lane of every active warp to lanes in the first warp. -/// -/// void inter_warp_copy_func(void* reduce_data, num_warps) -/// shared smem[warp_size]; -/// For all data entries D in reduce_data: -/// sync -/// If (I am the first lane in each warp) -/// Copy my local D to smem[warp_id] -/// sync -/// if (I am the first warp) -/// Copy smem[thread_id] to my local D -static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, - ArrayRef Privates, - QualType ReductionArrayTy, - SourceLocation Loc) { - ASTContext &C = CGM.getContext(); - llvm::Module &M = CGM.getModule(); - - // ReduceList: thread local Reduce list. - // At the stage of the computation when this function is called, partially - // aggregated values reside in the first lane of every active warp. 
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - // NumWarps: number of warps active in the parallel region. This could - // be smaller than 32 (max warps in a CTA) for partial block reduction. - ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.getIntTypeForBitwidth(32, /* Signed */ true), - ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&NumWarpsArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI), - llvm::GlobalValue::InternalLinkage, - "_omp_reduction_inter_warp_copy_func", &M); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - // This array is used as a medium to transfer, one reduce element at a time, - // the data from the first lane of every warp to lanes in the first warp - // in order to perform the final step of a reduction in a parallel region - // (reduction across warps). The array is placed in NVPTX __shared__ memory - // for reduced latency, as well as to have a distinct copy for concurrently - // executing target regions. The array is declared with common linkage so - // as to be shared across compilation units. - StringRef TransferMediumName = - "__openmp_nvptx_data_transfer_temporary_storage"; - llvm::GlobalVariable *TransferMedium = - M.getGlobalVariable(TransferMediumName); - unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; - if (!TransferMedium) { - auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); - unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); - TransferMedium = new llvm::GlobalVariable( - M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage, - llvm::UndefValue::get(Ty), TransferMediumName, - /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, - SharedAddressSpace); - CGM.addCompilerUsedGlobal(TransferMedium); - } - - auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); - // Get the CUDA thread id of the current OpenMP thread on the GPU. - llvm::Value *ThreadID = RT.getGPUThreadID(CGF); - // nvptx_lane_id = nvptx_id % warpsize - llvm::Value *LaneID = getNVPTXLaneID(CGF); - // nvptx_warp_id = nvptx_id / warpsize - llvm::Value *WarpID = getNVPTXWarpID(CGF); - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar( - AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, - LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), - ElemTy->getPointerTo()), - ElemTy, CGF.getPointerAlign()); - - unsigned Idx = 0; - for (const Expr *Private : Privates) { - // - // Warp master copies reduce element to transfer medium in __shared__ - // memory. 
- // - unsigned RealTySize = - C.getTypeSizeInChars(Private->getType()) - .alignTo(C.getTypeAlignInChars(Private->getType())) - .getQuantity(); - for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) { - unsigned NumIters = RealTySize / TySize; - if (NumIters == 0) - continue; - QualType CType = C.getIntTypeForBitwidth( - C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); - llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); - CharUnits Align = CharUnits::fromQuantity(TySize); - llvm::Value *Cnt = nullptr; - Address CntAddr = Address::invalid(); - llvm::BasicBlock *PrecondBB = nullptr; - llvm::BasicBlock *ExitBB = nullptr; - if (NumIters > 1) { - CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); - CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, - /*Volatile=*/false, C.IntTy); - PrecondBB = CGF.createBasicBlock("precond"); - ExitBB = CGF.createBasicBlock("exit"); - llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(PrecondBB); - Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); - llvm::Value *Cmp = - Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); - Bld.CreateCondBr(Cmp, BodyBB, ExitBB); - CGF.EmitBlock(BodyBB); - } - // kmpc_barrier. - CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, - /*EmitChecks=*/false, - /*ForceSimpleCall=*/true); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - - // if (lane_id == 0) - llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); - Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); - CGF.EmitBlock(ThenBB); - - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = ((CopyType*)(elemptrptr)) + I - Address ElemPtr(ElemPtrPtr, CopyType, Align); - if (NumIters > 1) - ElemPtr = Bld.CreateGEP(CGF, ElemPtr, Cnt); - - // Get pointer to location in transfer medium. - // MediumPtr = &medium[warp_id] - llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium->getValueType(), TransferMedium, - {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); - // Casting to actual data type. - // MediumPtr = (CopyType*)MediumPtrAddr; - Address MediumPtr(MediumPtrVal, CopyType, Align); - - // elem = *elemptr - //*MediumPtr = elem - llvm::Value *Elem = CGF.EmitLoadOfScalar( - ElemPtr, /*Volatile=*/false, CType, Loc, - LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(MergeBB); - - // kmpc_barrier. - CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, - /*EmitChecks=*/false, - /*ForceSimpleCall=*/true); - - // - // Warp 0 copies reduce element from transfer medium. 
- // - llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); - - Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); - llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( - AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); - - // Up to 32 threads in warp 0 are active. - llvm::Value *IsActiveThread = - Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); - Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); - - CGF.EmitBlock(W0ThenBB); - - // SrcMediumPtr = &medium[tid] - llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium->getValueType(), TransferMedium, - {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); - // SrcMediumVal = *SrcMediumPtr; - Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align); - - // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I - Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( - TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); - Address TargetElemPtr(TargetElemPtrVal, CopyType, Align); - if (NumIters > 1) - TargetElemPtr = Bld.CreateGEP(CGF, TargetElemPtr, Cnt); - - // *TargetElemPtr = SrcMediumVal; - llvm::Value *SrcMediumValue = - CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc); - CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, - CType); - Bld.CreateBr(W0MergeBB); - - CGF.EmitBlock(W0ElseBB); - Bld.CreateBr(W0MergeBB); - - CGF.EmitBlock(W0MergeBB); - - if (NumIters > 1) { - Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); - CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); - CGF.EmitBranch(PrecondBB); - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(ExitBB); - } - RealTySize %= TySize; - } - ++Idx; - } - - CGF.FinishFunction(); - return Fn; -} - -/// Emit a helper that reduces data across two OpenMP threads (lanes) -/// in the same warp. It uses shuffle instructions to copy over data from -/// a remote lane's stack. The reduction algorithm performed is specified -/// by the fourth parameter. -/// -/// Algorithm Versions. -/// Full Warp Reduce (argument value 0): -/// This algorithm assumes that all 32 lanes are active and gathers -/// data from these 32 lanes, producing a single resultant value. -/// Contiguous Partial Warp Reduce (argument value 1): -/// This algorithm assumes that only a *contiguous* subset of lanes -/// are active. This happens for the last warp in a parallel region -/// when the user specified num_threads is not an integer multiple of -/// 32. This contiguous subset always starts with the zeroth lane. -/// Partial Warp Reduce (argument value 2): -/// This algorithm gathers data from any number of lanes at any position. -/// All reduced values are stored in the lowest possible lane. The set -/// of problems every algorithm addresses is a super set of those -/// addressable by algorithms with a lower version number. Overhead -/// increases as algorithm version increases. -/// -/// Terminology -/// Reduce element: -/// Reduce element refers to the individual data field with primitive -/// data types to be combined and reduced across threads. -/// Reduce list: -/// Reduce list refers to a collection of local, thread-private -/// reduce elements. -/// Remote Reduce list: -/// Remote Reduce list refers to a collection of remote (relative to -/// the current thread) reduce elements. 
-/// -/// We distinguish between three states of threads that are important to -/// the implementation of this function. -/// Alive threads: -/// Threads in a warp executing the SIMT instruction, as distinguished from -/// threads that are inactive due to divergent control flow. -/// Active threads: -/// The minimal set of threads that has to be alive upon entry to this -/// function. The computation is correct iff active threads are alive. -/// Some threads are alive but they are not active because they do not -/// contribute to the computation in any useful manner. Turning them off -/// may introduce control flow overheads without any tangible benefits. -/// Effective threads: -/// In order to comply with the argument requirements of the shuffle -/// function, we must keep all lanes holding data alive. But at most -/// half of them perform value aggregation; we refer to this half of -/// threads as effective. The other half is simply handing off their -/// data. -/// -/// Procedure -/// Value shuffle: -/// In this step active threads transfer data from higher lane positions -/// in the warp to lower lane positions, creating Remote Reduce list. -/// Value aggregation: -/// In this step, effective threads combine their thread local Reduce list -/// with Remote Reduce list and store the result in the thread local -/// Reduce list. -/// Value copy: -/// In this step, we deal with the assumption made by algorithm 2 -/// (i.e. contiguity assumption). When we have an odd number of lanes -/// active, say 2k+1, only k threads will be effective and therefore k -/// new values will be produced. However, the Reduce list owned by the -/// (2k+1)th thread is ignored in the value aggregation. Therefore -/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so -/// that the contiguity assumption still holds. -static llvm::Function *emitShuffleAndReduceFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) { - ASTContext &C = CGM.getContext(); - - // Thread local Reduce list used to host the values of data to be reduced. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - // Current lane id; could be logical. - ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy, - ImplicitParamKind::Other); - // Offset of the remote source lane relative to the current lane. - ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.ShortTy, ImplicitParamKind::Other); - // Algorithm version. This is expected to be known at compile time. 
- ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.ShortTy, ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&LaneIDArg); - Args.push_back(&RemoteLaneOffsetArg); - Args.push_back(&AlgoVerArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, SourceLocation()), - ElemTy->getPointerTo()), - ElemTy, CGF.getPointerAlign()); - - Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); - llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( - AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); - llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( - AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); - llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( - AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - // Create a local thread-private variable to host the Reduce list - // from a remote lane. - Address RemoteReduceList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); - - // This loop iterates through the list of reduce elements and copies, - // element by element, from a remote lane in the warp to RemoteReduceList, - // hosted on the thread's stack. - emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, - LocalReduceList, RemoteReduceList, - {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal, - /*ScratchpadIndex=*/nullptr, - /*ScratchpadWidth=*/nullptr}); - - // The actions to be performed on the Remote Reduce list is dependent - // on the algorithm version. - // - // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && - // LaneId % 2 == 0 && Offset > 0): - // do the reduction value aggregation - // - // The thread local variable Reduce list is mutated in place to host the - // reduced data, which is the aggregated value produced from local and - // remote lanes. - // - // Note that AlgoVer is expected to be a constant integer known at compile - // time. - // When AlgoVer==0, the first conjunction evaluates to true, making - // the entire predicate true during compile time. - // When AlgoVer==1, the second conjunction has only the second part to be - // evaluated during runtime. Other conjunctions evaluates to false - // during compile time. - // When AlgoVer==2, the third conjunction has only the second part to be - // evaluated during runtime. Other conjunctions evaluates to false - // during compile time. 
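The participation test spelled out in the comment above, and rebuilt from IR-builder calls in the lines that follow, reads more directly as ordinary C++. This is a minimal reference sketch only; the helper names shouldReduce and shouldCopyRemote are illustrative and are not part of this patch:

static bool shouldReduce(unsigned AlgoVer, unsigned LaneId,
                         unsigned RemoteLaneOffset) {
  return AlgoVer == 0 ||                                // full-warp reduce
         (AlgoVer == 1 && LaneId < RemoteLaneOffset) || // contiguous partial warp
         (AlgoVer == 2 && LaneId % 2 == 0 && RemoteLaneOffset > 0); // dispersed
}

static bool shouldCopyRemote(unsigned AlgoVer, unsigned LaneId,
                             unsigned RemoteLaneOffset) {
  // Lanes at or past the offset adopt the remote Reduce list so the
  // contiguity assumption keeps holding for the next round.
  return AlgoVer == 1 && LaneId >= RemoteLaneOffset;
}

When shouldReduce holds, the lane calls reduce_function on its local and remote Reduce lists; when shouldCopyRemote holds, it simply overwrites its local Reduce list with the remote one.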
- llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal); - - llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); - llvm::Value *CondAlgo1 = Bld.CreateAnd( - Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); - - llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)); - llvm::Value *CondAlgo2 = Bld.CreateAnd( - Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)))); - CondAlgo2 = Bld.CreateAnd( - CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); - - llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); - CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); - - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); - - CGF.EmitBlock(ThenBB); - // reduce_function(LocalReduceList, RemoteReduceList) - llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - LocalReduceList.emitRawPointer(CGF), CGF.VoidPtrTy); - llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - RemoteReduceList.emitRawPointer(CGF), CGF.VoidPtrTy); - CGM.getOpenMPRuntime().emitOutlinedFunctionCall( - CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(MergeBB); - - // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local - // Reduce list. - Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); - llvm::Value *CondCopy = Bld.CreateAnd( - Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); - - llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); - Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); - - CGF.EmitBlock(CpyThenBB); - emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, - RemoteReduceList, LocalReduceList); - Bld.CreateBr(CpyMergeBB); - - CGF.EmitBlock(CpyElseBB); - Bld.CreateBr(CpyMergeBB); - - CGF.EmitBlock(CpyMergeBB); - - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that copies all the reduction variables from -/// the team into the provided global buffer for the reduction variables. -/// -/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) -/// For all data entries D in reduce_data: -/// Copy local D to buffer.D[Idx] -static llvm::Value *emitListToGlobalCopyFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamKind::Other); - // ReduceList: thread local Reduce list. 
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_list_to_global_copy_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), - ElemTy->getPointerTo()), - ElemTy, CGF.getPointerAlign()); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (const Expr *Private : Privates) { - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = ((CopyType*)(elemptrptr)) + I - ElemTy = CGF.ConvertTypeForMem(Private->getType()); - ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtrPtr, ElemTy->getPointerTo()); - Address ElemPtr = - Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType())); - const ValueDecl *VD = cast(Private)->getDecl(); - // Global = Buffer.VD[Idx]; - const FieldDecl *FD = VarFieldMap.lookup(VD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD); - Address GlobAddr = GlobLVal.getAddress(); - GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF), - CGF.ConvertTypeForMem(Private->getType()), - GlobAddr.getAlignment())); - switch (CGF.getEvaluationKind(Private->getType())) { - case TEK_Scalar: { - llvm::Value *V = CGF.EmitLoadOfScalar( - ElemPtr, /*Volatile=*/false, Private->getType(), Loc, - LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); - CGF.EmitStoreOfScalar(V, GlobLVal); - break; - } - case TEK_Complex: { - CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( - CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc); - CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false); - break; - } - case TEK_Aggregate: - CGF.EmitAggregateCopy(GlobLVal, - CGF.MakeAddrLValue(ElemPtr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - break; - } - ++Idx; - } - - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that reduces all the reduction variables from -/// the team into the provided global buffer for the reduction 
variables. -/// -/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) -/// void *GlobPtrs[]; -/// GlobPtrs[0] = (void*)&buffer.D0[Idx]; -/// ... -/// GlobPtrs[N] = (void*)&buffer.DN[Idx]; -/// reduce_function(GlobPtrs, reduce_data); -static llvm::Value *emitListToGlobalReduceFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap, - llvm::Function *ReduceFn) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamKind::Other); - // ReduceList: thread local Reduce list. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_list_to_global_reduce_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - - // 1. Build a list of reduction variables. - // void *RedList[] = {[0], ..., [-1]}; - RawAddress ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - // Global = Buffer.VD[Idx]; - const ValueDecl *VD = cast(*IPriv)->getDecl(); - const FieldDecl *FD = VarFieldMap.lookup(VD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD); - Address GlobAddr = GlobLVal.getAddress(); - CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem, - /*Volatile=*/false, C.VoidPtrTy); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. 
- ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - // Call reduce_function(GlobalReduceList, ReduceList) - llvm::Value *GlobalReduceList = ReductionList.getPointer(); - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( - AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); - CGM.getOpenMPRuntime().emitOutlinedFunctionCall( - CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr}); - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that copies all the reduction variables from -/// the team into the provided global buffer for the reduction variables. -/// -/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) -/// For all data entries D in reduce_data: -/// Copy buffer.D[Idx] to local D; -static llvm::Value *emitGlobalToListCopyFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamKind::Other); - // ReduceList: thread local Reduce list. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_global_to_list_copy_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), - ElemTy->getPointerTo()), - ElemTy, CGF.getPointerAlign()); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - - llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (const Expr *Private : Privates) { - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, 
C.VoidPtrTy, SourceLocation()); - // elemptr = ((CopyType*)(elemptrptr)) + I - ElemTy = CGF.ConvertTypeForMem(Private->getType()); - ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtrPtr, ElemTy->getPointerTo()); - Address ElemPtr = - Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType())); - const ValueDecl *VD = cast(Private)->getDecl(); - // Global = Buffer.VD[Idx]; - const FieldDecl *FD = VarFieldMap.lookup(VD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD); - Address GlobAddr = GlobLVal.getAddress(); - GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF), - CGF.ConvertTypeForMem(Private->getType()), - GlobAddr.getAlignment())); - switch (CGF.getEvaluationKind(Private->getType())) { - case TEK_Scalar: { - llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); - CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(), - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - break; - } - case TEK_Complex: { - CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc); - CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()), - /*isInit=*/false); - break; - } - case TEK_Aggregate: - CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), - GlobLVal, Private->getType(), - AggValueSlot::DoesNotOverlap); - break; - } - ++Idx; - } - - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that reduces all the reduction variables from -/// the team into the provided global buffer for the reduction variables. -/// -/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) -/// void *GlobPtrs[]; -/// GlobPtrs[0] = (void*)&buffer.D0[Idx]; -/// ... -/// GlobPtrs[N] = (void*)&buffer.DN[Idx]; -/// reduce_function(reduce_data, GlobPtrs); -static llvm::Value *emitGlobalToListReduceFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap, - llvm::Function *ReduceFn) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamKind::Other); - // ReduceList: thread local Reduce list. 
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_global_to_list_reduce_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - - // 1. Build a list of reduction variables. - // void *RedList[] = {[0], ..., [-1]}; - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - // Global = Buffer.VD[Idx]; - const ValueDecl *VD = cast(*IPriv)->getDecl(); - const FieldDecl *FD = VarFieldMap.lookup(VD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD); - Address GlobAddr = GlobLVal.getAddress(); - CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem, - /*Volatile=*/false, C.VoidPtrTy); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. 
- ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - // Call reduce_function(ReduceList, GlobalReduceList) - llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF); - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( - AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); - CGM.getOpenMPRuntime().emitOutlinedFunctionCall( - CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList}); - CGF.FinishFunction(); - return Fn; -} - /// /// Design of OpenMP reductions on the GPU /// @@ -2812,21 +1660,20 @@ void CGOpenMPRuntimeGPU::emitReduction( return; bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); -#ifndef NDEBUG + bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind); bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); -#endif + + ASTContext &C = CGM.getContext(); if (Options.SimpleReduction) { assert(!TeamsReduction && !ParallelReduction && "Invalid reduction selection in emitReduction."); + (void)ParallelReduction; CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps, Options); return; } - assert((TeamsReduction || ParallelReduction) && - "Invalid reduction selection in emitReduction."); - llvm::SmallDenseMap VarFieldMap; llvm::SmallVector PrivatesReductions(Privates.size()); int Cnt = 0; @@ -2834,145 +1681,84 @@ void CGOpenMPRuntimeGPU::emitReduction( PrivatesReductions[Cnt] = cast(DRE)->getDecl(); ++Cnt; } - - ASTContext &C = CGM.getContext(); const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars( CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1); - // Build res = __kmpc_reduce{_nowait}(, , sizeof(RedList), - // RedList, shuffle_reduce_func, interwarp_copy_func); - // or - // Build res = __kmpc_reduce_teams_nowait_simple(, , ); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - - llvm::Value *Res; - // 1. Build a list of reduction variables. - // void *RedList[] = {[0], ..., [-1]}; - auto Size = RHSExprs.size(); - for (const Expr *E : Privates) { - if (E->getType()->isVariablyModifiedType()) - // Reserve place for array size. - ++Size; - } - llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); - QualType ReductionArrayTy = C.getConstantArrayType( - C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal, - /*IndexTypeQuals=*/0); - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - unsigned Idx = 0; - for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - CGF.Builder.CreateStore( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy), - Elem); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. 
- ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy); - llvm::Function *ReductionFn = emitReductionFunction( - CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy), - Privates, LHSExprs, RHSExprs, ReductionOps); - llvm::Value *ReductionDataSize = - CGF.getTypeSize(C.getRecordType(ReductionRec)); - ReductionDataSize = - CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty); - llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( - CGM, Privates, ReductionArrayTy, ReductionFn, Loc); - llvm::Value *InterWarpCopyFn = - emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); - - if (ParallelReduction) { - llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn, - InterWarpCopyFn}; - - Res = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2), - Args); - } else { - assert(TeamsReduction && "expected teams reduction."); + if (TeamsReduction) TeamsReductions.push_back(ReductionRec); - auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer), - {}, "_openmp_teams_reductions_buffer_$_$ptr"); - llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( - CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap); - llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( - CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap, - ReductionFn); - llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( - CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap); - llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( - CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap, - ReductionFn); - llvm::Value *Args[] = { - RTLoc, - KernelTeamsReductionPtr, - CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), - ReductionDataSize, - RL, - ShuffleAndReduceFn, - InterWarpCopyFn, - GlobalToBufferCpyFn, - GlobalToBufferRedFn, - BufferToGlobalCpyFn, - BufferToGlobalRedFn}; - - Res = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2), - Args); - } + // Source location for the ident struct + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - // 5. Build if (res == 1) - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done"); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then"); - llvm::Value *Cond = CGF.Builder.CreateICmpEQ( - Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1)); - CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB); - - // 6. Build then branch: where we have reduced values in the master - // thread in each team. 
- // __kmpc_end_reduce{_nowait}(); - // break; - CGF.EmitBlock(ThenBB); - - // Add emission of __kmpc_end_reduce{_nowait}(); - auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps, - this](CodeGenFunction &CGF, PrePostActionTy &Action) { - auto IPriv = Privates.begin(); - auto ILHS = LHSExprs.begin(); - auto IRHS = RHSExprs.begin(); - for (const Expr *E : ReductionOps) { - emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), - cast(*IRHS)); - ++IPriv; - ++ILHS; - ++IRHS; + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(), + CGF.AllocaInsertPt->getIterator()); + InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(), + CGF.Builder.GetInsertPoint()); + llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP); + llvm::SmallVector ReductionInfos; + + CodeGenFunction::OMPPrivateScope Scope(CGF); + unsigned Idx = 0; + for (const Expr *Private : Privates) { + llvm::Type *ElementType; + llvm::Value *Variable; + llvm::Value *PrivateVariable; + llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr; + ElementType = CGF.ConvertTypeForMem(Private->getType()); + const auto *RHSVar = + cast(cast(RHSExprs[Idx])->getDecl()); + PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF); + const auto *LHSVar = + cast(cast(LHSExprs[Idx])->getDecl()); + Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF); + llvm::OpenMPIRBuilder::EvalKind EvalKind; + switch (CGF.getEvaluationKind(Private->getType())) { + case TEK_Scalar: + EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar; + break; + case TEK_Complex: + EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex; + break; + case TEK_Aggregate: + EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate; + break; } - }; - RegionCodeGenTy RCG(CodeGen); - RCG(CGF); - // There is no need to emit line number for unconditional branch. 
- (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(ExitBB, /*IsFinished=*/true); + auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I, + llvm::Value **LHSPtr, llvm::Value **RHSPtr, + llvm::Function *NewFunc) { + CGF.Builder.restoreIP(CodeGenIP); + auto *CurFn = CGF.CurFn; + CGF.CurFn = NewFunc; + + *LHSPtr = CGF.GetAddrOfLocalVar( + cast(cast(LHSExprs[I])->getDecl())) + .emitRawPointer(CGF); + *RHSPtr = CGF.GetAddrOfLocalVar( + cast(cast(RHSExprs[I])->getDecl())) + .emitRawPointer(CGF); + + emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I], + cast(LHSExprs[I]), + cast(RHSExprs[I])); + + CGF.CurFn = CurFn; + + return InsertPointTy(CGF.Builder.GetInsertBlock(), + CGF.Builder.GetInsertPoint()); + }; + ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo( + ElementType, Variable, PrivateVariable, EvalKind, + /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen)); + Idx++; + } + + CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU( + OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, + DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, + CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum, + RTLoc)); + return; } const VarDecl * @@ -3441,112 +2227,112 @@ bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, return false; } -// Get current CudaArch and ignore any unknown values -static CudaArch getCudaArch(CodeGenModule &CGM) { +// Get current OffloadArch and ignore any unknown values +static OffloadArch getOffloadArch(CodeGenModule &CGM) { if (!CGM.getTarget().hasFeature("ptx")) - return CudaArch::UNKNOWN; + return OffloadArch::UNKNOWN; for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { if (Feature.getValue()) { - CudaArch Arch = StringToCudaArch(Feature.getKey()); - if (Arch != CudaArch::UNKNOWN) + OffloadArch Arch = StringToOffloadArch(Feature.getKey()); + if (Arch != OffloadArch::UNKNOWN) return Arch; } } - return CudaArch::UNKNOWN; + return OffloadArch::UNKNOWN; } /// Check to see if target architecture supports unified addressing which is /// a restriction for OpenMP requires clause "unified_shared_memory". 
-void CGOpenMPRuntimeGPU::processRequiresDirective( - const OMPRequiresDecl *D) { +void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { for (const OMPClause *Clause : D->clauselists()) { if (Clause->getClauseKind() == OMPC_unified_shared_memory) { - CudaArch Arch = getCudaArch(CGM); + OffloadArch Arch = getOffloadArch(CGM); switch (Arch) { - case CudaArch::SM_20: - case CudaArch::SM_21: - case CudaArch::SM_30: - case CudaArch::SM_32_: - case CudaArch::SM_35: - case CudaArch::SM_37: - case CudaArch::SM_50: - case CudaArch::SM_52: - case CudaArch::SM_53: { + case OffloadArch::SM_20: + case OffloadArch::SM_21: + case OffloadArch::SM_30: + case OffloadArch::SM_32_: + case OffloadArch::SM_35: + case OffloadArch::SM_37: + case OffloadArch::SM_50: + case OffloadArch::SM_52: + case OffloadArch::SM_53: { SmallString<256> Buffer; llvm::raw_svector_ostream Out(Buffer); - Out << "Target architecture " << CudaArchToString(Arch) + Out << "Target architecture " << OffloadArchToString(Arch) << " does not support unified addressing"; CGM.Error(Clause->getBeginLoc(), Out.str()); return; } - case CudaArch::SM_60: - case CudaArch::SM_61: - case CudaArch::SM_62: - case CudaArch::SM_70: - case CudaArch::SM_72: - case CudaArch::SM_75: - case CudaArch::SM_80: - case CudaArch::SM_86: - case CudaArch::SM_87: - case CudaArch::SM_89: - case CudaArch::SM_90: - case CudaArch::SM_90a: - case CudaArch::GFX600: - case CudaArch::GFX601: - case CudaArch::GFX602: - case CudaArch::GFX700: - case CudaArch::GFX701: - case CudaArch::GFX702: - case CudaArch::GFX703: - case CudaArch::GFX704: - case CudaArch::GFX705: - case CudaArch::GFX801: - case CudaArch::GFX802: - case CudaArch::GFX803: - case CudaArch::GFX805: - case CudaArch::GFX810: - case CudaArch::GFX9_GENERIC: - case CudaArch::GFX900: - case CudaArch::GFX902: - case CudaArch::GFX904: - case CudaArch::GFX906: - case CudaArch::GFX908: - case CudaArch::GFX909: - case CudaArch::GFX90a: - case CudaArch::GFX90c: - case CudaArch::GFX940: - case CudaArch::GFX941: - case CudaArch::GFX942: - case CudaArch::GFX10_1_GENERIC: - case CudaArch::GFX1010: - case CudaArch::GFX1011: - case CudaArch::GFX1012: - case CudaArch::GFX1013: - case CudaArch::GFX10_3_GENERIC: - case CudaArch::GFX1030: - case CudaArch::GFX1031: - case CudaArch::GFX1032: - case CudaArch::GFX1033: - case CudaArch::GFX1034: - case CudaArch::GFX1035: - case CudaArch::GFX1036: - case CudaArch::GFX11_GENERIC: - case CudaArch::GFX1100: - case CudaArch::GFX1101: - case CudaArch::GFX1102: - case CudaArch::GFX1103: - case CudaArch::GFX1150: - case CudaArch::GFX1151: - case CudaArch::GFX1152: - case CudaArch::GFX12_GENERIC: - case CudaArch::GFX1200: - case CudaArch::GFX1201: - case CudaArch::Generic: - case CudaArch::UNUSED: - case CudaArch::UNKNOWN: + case OffloadArch::SM_60: + case OffloadArch::SM_61: + case OffloadArch::SM_62: + case OffloadArch::SM_70: + case OffloadArch::SM_72: + case OffloadArch::SM_75: + case OffloadArch::SM_80: + case OffloadArch::SM_86: + case OffloadArch::SM_87: + case OffloadArch::SM_89: + case OffloadArch::SM_90: + case OffloadArch::SM_90a: + case OffloadArch::GFX600: + case OffloadArch::GFX601: + case OffloadArch::GFX602: + case OffloadArch::GFX700: + case OffloadArch::GFX701: + case OffloadArch::GFX702: + case OffloadArch::GFX703: + case OffloadArch::GFX704: + case OffloadArch::GFX705: + case OffloadArch::GFX801: + case OffloadArch::GFX802: + case OffloadArch::GFX803: + case OffloadArch::GFX805: + case OffloadArch::GFX810: + case OffloadArch::GFX9_GENERIC: + case 
OffloadArch::GFX900: + case OffloadArch::GFX902: + case OffloadArch::GFX904: + case OffloadArch::GFX906: + case OffloadArch::GFX908: + case OffloadArch::GFX909: + case OffloadArch::GFX90a: + case OffloadArch::GFX90c: + case OffloadArch::GFX940: + case OffloadArch::GFX941: + case OffloadArch::GFX942: + case OffloadArch::GFX10_1_GENERIC: + case OffloadArch::GFX1010: + case OffloadArch::GFX1011: + case OffloadArch::GFX1012: + case OffloadArch::GFX1013: + case OffloadArch::GFX10_3_GENERIC: + case OffloadArch::GFX1030: + case OffloadArch::GFX1031: + case OffloadArch::GFX1032: + case OffloadArch::GFX1033: + case OffloadArch::GFX1034: + case OffloadArch::GFX1035: + case OffloadArch::GFX1036: + case OffloadArch::GFX11_GENERIC: + case OffloadArch::GFX1100: + case OffloadArch::GFX1101: + case OffloadArch::GFX1102: + case OffloadArch::GFX1103: + case OffloadArch::GFX1150: + case OffloadArch::GFX1151: + case OffloadArch::GFX1152: + case OffloadArch::GFX12_GENERIC: + case OffloadArch::GFX1200: + case OffloadArch::GFX1201: + case OffloadArch::AMDGCNSPIRV: + case OffloadArch::Generic: + case OffloadArch::UNUSED: + case OffloadArch::UNKNOWN: break; - case CudaArch::LAST: - llvm_unreachable("Unexpected Cuda arch."); + case OffloadArch::LAST: + llvm_unreachable("Unexpected GPU arch."); } } } @@ -3573,10 +2359,3 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block), Args); } - -llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) { - ArrayRef Args{}; - return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_get_warp_size), - Args); -} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index 141436f26230dd..4d586ec972f8d6 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -150,9 +150,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { CodeGenFunction &CGF, const std::pair &AddrSizePair) override; - /// Get the GPU warp size. - llvm::Value *getGPUWarpSize(CodeGenFunction &CGF); - /// Get the id of the current thread on the GPU. llvm::Value *getGPUThreadID(CodeGenFunction &CGF); diff --git a/clang/lib/CodeGen/CGPointerAuth.cpp b/clang/lib/CodeGen/CGPointerAuth.cpp index f0819b04674898..673f6e60bfc317 100644 --- a/clang/lib/CodeGen/CGPointerAuth.cpp +++ b/clang/lib/CodeGen/CGPointerAuth.cpp @@ -11,12 +11,56 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenFunction.h" #include "CodeGenModule.h" #include "clang/CodeGen/CodeGenABITypes.h" +#include "clang/CodeGen/ConstantInitBuilder.h" +#include "llvm/Support/SipHash.h" using namespace clang; using namespace CodeGen; +/// Given a pointer-authentication schema, return a concrete "other" +/// discriminator for it. 
+llvm::ConstantInt *CodeGenModule::getPointerAuthOtherDiscriminator(
+    const PointerAuthSchema &Schema, GlobalDecl Decl, QualType Type) {
+  switch (Schema.getOtherDiscrimination()) {
+  case PointerAuthSchema::Discrimination::None:
+    return nullptr;
+
+  case PointerAuthSchema::Discrimination::Type:
+    llvm_unreachable("type discrimination not implemented yet");
+
+  case PointerAuthSchema::Discrimination::Decl:
+    assert(Decl.getDecl() &&
+           "declaration not provided for decl-discriminated schema");
+    return llvm::ConstantInt::get(IntPtrTy,
+                                  getPointerAuthDeclDiscriminator(Decl));
+
+  case PointerAuthSchema::Discrimination::Constant:
+    return llvm::ConstantInt::get(IntPtrTy, Schema.getConstantDiscrimination());
+  }
+  llvm_unreachable("bad discrimination kind");
+}
+
+uint16_t CodeGen::getPointerAuthDeclDiscriminator(CodeGenModule &CGM,
+                                                  GlobalDecl Declaration) {
+  return CGM.getPointerAuthDeclDiscriminator(Declaration);
+}
+
+/// Return the "other" decl-specific discriminator for the given decl.
+uint16_t
+CodeGenModule::getPointerAuthDeclDiscriminator(GlobalDecl Declaration) {
+  uint16_t &EntityHash = PtrAuthDiscriminatorHashes[Declaration];
+
+  if (EntityHash == 0) {
+    StringRef Name = getMangledName(Declaration);
+    EntityHash = llvm::getPointerAuthStableSipHash(Name);
+  }
+
+  return EntityHash;
+}
+
 /// Return the abstract pointer authentication schema for a pointer to the given
 /// function type.
 CGPointerAuthInfo CodeGenModule::getFunctionPointerAuthInfo(QualType T) {
@@ -35,6 +79,41 @@ CGPointerAuthInfo CodeGenModule::getFunctionPointerAuthInfo(QualType T) {
                            /*Discriminator=*/nullptr);
 }
 
+llvm::Value *
+CodeGenFunction::EmitPointerAuthBlendDiscriminator(llvm::Value *StorageAddress,
+                                                   llvm::Value *Discriminator) {
+  StorageAddress = Builder.CreatePtrToInt(StorageAddress, IntPtrTy);
+  auto Intrinsic = CGM.getIntrinsic(llvm::Intrinsic::ptrauth_blend);
+  return Builder.CreateCall(Intrinsic, {StorageAddress, Discriminator});
+}
+
+/// Emit the concrete pointer authentication information for the
+/// given authentication schema.
+CGPointerAuthInfo CodeGenFunction::EmitPointerAuthInfo( + const PointerAuthSchema &Schema, llvm::Value *StorageAddress, + GlobalDecl SchemaDecl, QualType SchemaType) { + if (!Schema) + return CGPointerAuthInfo(); + + llvm::Value *Discriminator = + CGM.getPointerAuthOtherDiscriminator(Schema, SchemaDecl, SchemaType); + + if (Schema.isAddressDiscriminated()) { + assert(StorageAddress && + "address not provided for address-discriminated schema"); + + if (Discriminator) + Discriminator = + EmitPointerAuthBlendDiscriminator(StorageAddress, Discriminator); + else + Discriminator = Builder.CreatePtrToInt(StorageAddress, IntPtrTy); + } + + return CGPointerAuthInfo(Schema.getKey(), Schema.getAuthenticationMode(), + Schema.isIsaPointer(), + Schema.authenticatesNullValues(), Discriminator); +} + llvm::Constant * CodeGenModule::getConstantSignedPointer(llvm::Constant *Pointer, unsigned Key, llvm::Constant *StorageAddress, @@ -60,6 +139,29 @@ CodeGenModule::getConstantSignedPointer(llvm::Constant *Pointer, unsigned Key, IntegerDiscriminator, AddressDiscriminator); } +/// Does a given PointerAuthScheme require us to sign a value +bool CodeGenModule::shouldSignPointer(const PointerAuthSchema &Schema) { + auto AuthenticationMode = Schema.getAuthenticationMode(); + return AuthenticationMode == PointerAuthenticationMode::SignAndStrip || + AuthenticationMode == PointerAuthenticationMode::SignAndAuth; +} + +/// Sign a constant pointer using the given scheme, producing a constant +/// with the same IR type. +llvm::Constant *CodeGenModule::getConstantSignedPointer( + llvm::Constant *Pointer, const PointerAuthSchema &Schema, + llvm::Constant *StorageAddress, GlobalDecl SchemaDecl, + QualType SchemaType) { + assert(shouldSignPointer(Schema)); + llvm::ConstantInt *OtherDiscriminator = + getPointerAuthOtherDiscriminator(Schema, SchemaDecl, SchemaType); + + return getConstantSignedPointer(Pointer, Schema.getKey(), StorageAddress, + OtherDiscriminator); +} + +/// If applicable, sign a given constant function pointer with the ABI rules for +/// functionType. 
llvm::Constant *CodeGenModule::getFunctionPointer(llvm::Constant *Pointer, QualType FunctionType) { assert(FunctionType->isFunctionType() || @@ -80,3 +182,113 @@ llvm::Constant *CodeGenModule::getFunctionPointer(GlobalDecl GD, QualType FuncType = FD->getType(); return getFunctionPointer(getRawFunctionPointer(GD, Ty), FuncType); } + +std::optional +CodeGenModule::computeVTPointerAuthentication(const CXXRecordDecl *ThisClass) { + auto DefaultAuthentication = getCodeGenOpts().PointerAuth.CXXVTablePointers; + if (!DefaultAuthentication) + return std::nullopt; + const CXXRecordDecl *PrimaryBase = + Context.baseForVTableAuthentication(ThisClass); + + unsigned Key = DefaultAuthentication.getKey(); + bool AddressDiscriminated = DefaultAuthentication.isAddressDiscriminated(); + auto DefaultDiscrimination = DefaultAuthentication.getOtherDiscrimination(); + unsigned TypeBasedDiscriminator = + Context.getPointerAuthVTablePointerDiscriminator(PrimaryBase); + unsigned Discriminator; + if (DefaultDiscrimination == PointerAuthSchema::Discrimination::Type) { + Discriminator = TypeBasedDiscriminator; + } else if (DefaultDiscrimination == + PointerAuthSchema::Discrimination::Constant) { + Discriminator = DefaultAuthentication.getConstantDiscrimination(); + } else { + assert(DefaultDiscrimination == PointerAuthSchema::Discrimination::None); + Discriminator = 0; + } + if (auto ExplicitAuthentication = + PrimaryBase->getAttr()) { + auto ExplicitAddressDiscrimination = + ExplicitAuthentication->getAddressDiscrimination(); + auto ExplicitDiscriminator = + ExplicitAuthentication->getExtraDiscrimination(); + + unsigned ExplicitKey = ExplicitAuthentication->getKey(); + if (ExplicitKey == VTablePointerAuthenticationAttr::NoKey) + return std::nullopt; + + if (ExplicitKey != VTablePointerAuthenticationAttr::DefaultKey) { + if (ExplicitKey == VTablePointerAuthenticationAttr::ProcessIndependent) + Key = (unsigned)PointerAuthSchema::ARM8_3Key::ASDA; + else { + assert(ExplicitKey == + VTablePointerAuthenticationAttr::ProcessDependent); + Key = (unsigned)PointerAuthSchema::ARM8_3Key::ASDB; + } + } + + if (ExplicitAddressDiscrimination != + VTablePointerAuthenticationAttr::DefaultAddressDiscrimination) + AddressDiscriminated = + ExplicitAddressDiscrimination == + VTablePointerAuthenticationAttr::AddressDiscrimination; + + if (ExplicitDiscriminator == + VTablePointerAuthenticationAttr::TypeDiscrimination) + Discriminator = TypeBasedDiscriminator; + else if (ExplicitDiscriminator == + VTablePointerAuthenticationAttr::CustomDiscrimination) + Discriminator = ExplicitAuthentication->getCustomDiscriminationValue(); + else if (ExplicitDiscriminator == + VTablePointerAuthenticationAttr::NoExtraDiscrimination) + Discriminator = 0; + } + return PointerAuthQualifier::Create(Key, AddressDiscriminated, Discriminator, + PointerAuthenticationMode::SignAndAuth, + /* IsIsaPointer */ false, + /* AuthenticatesNullValues */ false); +} + +std::optional +CodeGenModule::getVTablePointerAuthentication(const CXXRecordDecl *Record) { + if (!Record->getDefinition() || !Record->isPolymorphic()) + return std::nullopt; + + auto Existing = VTablePtrAuthInfos.find(Record); + std::optional Authentication; + if (Existing != VTablePtrAuthInfos.end()) { + Authentication = Existing->getSecond(); + } else { + Authentication = computeVTPointerAuthentication(Record); + VTablePtrAuthInfos.insert(std::make_pair(Record, Authentication)); + } + return Authentication; +} + +std::optional +CodeGenModule::getVTablePointerAuthInfo(CodeGenFunction *CGF, + const 
CXXRecordDecl *Record, + llvm::Value *StorageAddress) { + auto Authentication = getVTablePointerAuthentication(Record); + if (!Authentication) + return std::nullopt; + + llvm::Value *Discriminator = nullptr; + if (auto ExtraDiscriminator = Authentication->getExtraDiscriminator()) + Discriminator = llvm::ConstantInt::get(IntPtrTy, ExtraDiscriminator); + + if (Authentication->isAddressDiscriminated()) { + assert(StorageAddress && + "address not provided for address-discriminated schema"); + if (Discriminator) + Discriminator = + CGF->EmitPointerAuthBlendDiscriminator(StorageAddress, Discriminator); + else + Discriminator = CGF->Builder.CreatePtrToInt(StorageAddress, IntPtrTy); + } + + return CGPointerAuthInfo(Authentication->getKey(), + PointerAuthenticationMode::SignAndAuth, + /* IsIsaPointer */ false, + /* AuthenticatesNullValues */ false, Discriminator); +} diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index f73d32de7c4848..43cf4dc630cd77 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -193,7 +193,7 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { if (PreInits) { // CompoundStmts and DeclStmts are used as lists of PreInit statements and // declarations. Since declarations must be visible in the the following - // that they initialize, unpack the ComboundStmt they are nested in. + // that they initialize, unpack the CompoundStmt they are nested in. SmallVector PreInitStmts; if (auto *PreInitCompound = dyn_cast(PreInits)) llvm::append_range(PreInitStmts, PreInitCompound->body()); @@ -1411,7 +1411,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit( case OMPD_end_declare_variant: case OMPD_unknown: default: - llvm_unreachable("Enexpected directive with task reductions."); + llvm_unreachable("Unexpected directive with task reductions."); } const auto *VD = cast(cast(TaskRedRef)->getDecl()); @@ -1766,7 +1766,7 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - // The cleanup callback that finalizes all variabels at the given location, + // The cleanup callback that finalizes all variables at the given location, // thus calls destructors etc. auto FiniCB = [this](InsertPointTy IP) { OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP); @@ -6506,7 +6506,7 @@ static void emitOMPAtomicCompareExpr( } if (FailAO == llvm::AtomicOrdering::NotAtomic) { - // fail clause was not mentionend on the + // fail clause was not mentioned on the // "#pragma omp atomic compare" construct. CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare( CGF.Builder, XOpVal, VOpVal, ROpVal, EVal, DVal, AO, Op, IsXBinopExpr, @@ -7920,7 +7920,7 @@ void CodeGenFunction::EmitOMPGenericLoopDirective( void CodeGenFunction::EmitOMPParallelGenericLoopDirective( const OMPLoopDirective &S) { - // Emit combined directive as if its consituent constructs are 'parallel' + // Emit combined directive as if its constituent constructs are 'parallel' // and 'for'. 
auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { Action.Enter(CGF); diff --git a/clang/lib/CodeGen/CGVTT.cpp b/clang/lib/CodeGen/CGVTT.cpp index 4cebb750c89e8f..20bd2c2fc2c642 100644 --- a/clang/lib/CodeGen/CGVTT.cpp +++ b/clang/lib/CodeGen/CGVTT.cpp @@ -90,6 +90,11 @@ CodeGenVTables::EmitVTTDefinition(llvm::GlobalVariable *VTT, llvm::Constant *Init = llvm::ConstantExpr::getGetElementPtr( VTable->getValueType(), VTable, Idxs, /*InBounds=*/true, InRange); + if (const auto &Schema = + CGM.getCodeGenOpts().PointerAuth.CXXVTTVTablePointers) + Init = CGM.getConstantSignedPointer(Init, Schema, nullptr, GlobalDecl(), + QualType()); + VTTComponents.push_back(Init); } diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index 55c3032dc9332c..3e88cd73e8c1f4 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -95,7 +95,7 @@ static RValue PerformReturnAdjustment(CodeGenFunction &CGF, CGF, Address(ReturnValue, CGF.ConvertTypeForMem(ResultType->getPointeeType()), ClassAlign), - Thunk.Return); + ClassDecl, Thunk.Return); if (NullCheckValue) { CGF.Builder.CreateBr(AdjustEnd); @@ -219,8 +219,10 @@ CodeGenFunction::GenerateVarArgsThunk(llvm::Function *Fn, "Store of this should be in entry block?"); // Adjust "this", if necessary. Builder.SetInsertPoint(&*ThisStore); - llvm::Value *AdjustedThisPtr = - CGM.getCXXABI().performThisAdjustment(*this, ThisPtr, Thunk.This); + + const CXXRecordDecl *ThisValueClass = Thunk.ThisType->getPointeeCXXRecordDecl(); + llvm::Value *AdjustedThisPtr = CGM.getCXXABI().performThisAdjustment( + *this, ThisPtr, ThisValueClass, Thunk); AdjustedThisPtr = Builder.CreateBitCast(AdjustedThisPtr, ThisStore->getOperand(0)->getType()); ThisStore->setOperand(0, AdjustedThisPtr); @@ -307,10 +309,15 @@ void CodeGenFunction::EmitCallAndReturnForThunk(llvm::FunctionCallee Callee, const CXXMethodDecl *MD = cast(CurGD.getDecl()); // Adjust the 'this' pointer if necessary + const CXXRecordDecl *ThisValueClass = + MD->getThisType()->getPointeeCXXRecordDecl(); + if (Thunk) + ThisValueClass = Thunk->ThisType->getPointeeCXXRecordDecl(); + llvm::Value *AdjustedThisPtr = - Thunk ? CGM.getCXXABI().performThisAdjustment( - *this, LoadCXXThisAddress(), Thunk->This) - : LoadCXXThis(); + Thunk ? CGM.getCXXABI().performThisAdjustment(*this, LoadCXXThisAddress(), + ThisValueClass, *Thunk) + : LoadCXXThis(); // If perfect forwarding is required a variadic method, a method using // inalloca, or an unprototyped thunk, use musttail. 
Emit an error if this @@ -504,10 +511,22 @@ llvm::Constant *CodeGenVTables::maybeEmitThunk(GlobalDecl GD, SmallString<256> Name; MangleContext &MCtx = CGM.getCXXABI().getMangleContext(); llvm::raw_svector_ostream Out(Name); - if (const CXXDestructorDecl *DD = dyn_cast(MD)) - MCtx.mangleCXXDtorThunk(DD, GD.getDtorType(), TI.This, Out); - else - MCtx.mangleThunk(MD, TI, Out); + + if (const CXXDestructorDecl *DD = dyn_cast(MD)) { + MCtx.mangleCXXDtorThunk(DD, GD.getDtorType(), TI, + /* elideOverrideInfo */ false, Out); + } else + MCtx.mangleThunk(MD, TI, /* elideOverrideInfo */ false, Out); + + if (CGM.getContext().useAbbreviatedThunkName(GD, Name.str())) { + Name = ""; + if (const CXXDestructorDecl *DD = dyn_cast(MD)) + MCtx.mangleCXXDtorThunk(DD, GD.getDtorType(), TI, + /* elideOverrideInfo */ true, Out); + else + MCtx.mangleThunk(MD, TI, /* elideOverrideInfo */ true, Out); + } + llvm::Type *ThunkVTableTy = CGM.getTypes().GetFunctionTypeForVTable(GD); llvm::Constant *Thunk = CGM.GetAddrOfThunk(Name, ThunkVTableTy, GD); @@ -819,11 +838,17 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder, nextVTableThunkIndex++; fnPtr = maybeEmitThunk(GD, thunkInfo, /*ForVTable=*/true); + if (CGM.getCodeGenOpts().PointerAuth.CXXVirtualFunctionPointers) { + assert(thunkInfo.Method && "Method not set"); + GD = GD.getWithDecl(thunkInfo.Method); + } // Otherwise we can use the method definition directly. } else { llvm::Type *fnTy = CGM.getTypes().GetFunctionTypeForVTable(GD); fnPtr = CGM.GetAddrOfFunction(GD, fnTy, /*ForVTable=*/true); + if (CGM.getCodeGenOpts().PointerAuth.CXXVirtualFunctionPointers) + GD = getItaniumVTableContext().findOriginalMethod(GD); } if (useRelativeLayout()) { @@ -841,6 +866,9 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder, if (FnAS != GVAS) fnPtr = llvm::ConstantExpr::getAddrSpaceCast(fnPtr, CGM.GlobalsInt8PtrTy); + if (const auto &Schema = + CGM.getCodeGenOpts().PointerAuth.CXXVirtualFunctionPointers) + return builder.addSignedPointer(fnPtr, Schema, GD, QualType()); return builder.add(fnPtr); } } diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index 8dd9d8b54c25fa..2a179deddcc314 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -39,6 +39,7 @@ set(LLVM_LINK_COMPONENTS # our goal is to disable the /Og flag while retaining the other optimizations from the /O1|/O2 set if(MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES Clang AND MSVC_VERSION VERSION_GREATER_EQUAL 1932 + AND MSVC_VERSION VERSION_LESS 1939 AND CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64") string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 6d3efdb5ffe34e..0b92c5318a5c23 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -226,16 +226,11 @@ void BackendConsumer::HandleInterestingDecl(DeclGroupRef D) { HandleTopLevelDecl(D); } -// Links each entry in LinkModules into our module. Returns true on error. -bool BackendConsumer::LinkInModules(llvm::Module *M, bool ShouldLinkFiles) { +// Links each entry in LinkModules into our module. Returns true on error. 
+bool BackendConsumer::LinkInModules(llvm::Module *M) { for (auto &LM : LinkModules) { assert(LM.Module && "LinkModule does not actually have a module"); - // If ShouldLinkFiles is not set, skip files added via the - // -mlink-bitcode-files, only linking -mlink-builtin-bitcode - if (!LM.Internalize && !ShouldLinkFiles) - continue; - if (LM.PropagateAttrs) for (Function &F : *LM.Module) { // Skip intrinsics. Keep consistent with how intrinsics are created @@ -299,6 +294,9 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { Ctx.setDiagnosticHandler(std::make_unique( CodeGenOpts, this)); + Ctx.setDefaultTargetCPU(TargetOpts.CPU); + Ctx.setDefaultTargetFeatures(llvm::join(TargetOpts.Features, ",")); + Expected> OptRecordFileOrErr = setupLLVMOptimizationRemarks( Ctx, CodeGenOpts.OptRecordFile, CodeGenOpts.OptRecordPasses, @@ -1205,6 +1203,9 @@ void CodeGenAction::ExecuteAction() { Ctx.setDiagnosticHandler( std::make_unique(CodeGenOpts, &Result)); + Ctx.setDefaultTargetCPU(TargetOpts.CPU); + Ctx.setDefaultTargetFeatures(llvm::join(TargetOpts.Features, ",")); + Expected> OptRecordFileOrErr = setupLLVMOptimizationRemarks( Ctx, CodeGenOpts.OptRecordFile, CodeGenOpts.OptRecordPasses, diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 650c5662539f9f..26deeca95d326c 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -3063,3 +3063,66 @@ void CodeGenFunction::EmitPointerAuthOperandBundle( llvm::Value *Args[] = {Key, Discriminator}; Bundles.emplace_back("ptrauth", Args); } + +static llvm::Value *EmitPointerAuthCommon(CodeGenFunction &CGF, + const CGPointerAuthInfo &PointerAuth, + llvm::Value *Pointer, + unsigned IntrinsicID) { + if (!PointerAuth) + return Pointer; + + auto Key = CGF.Builder.getInt32(PointerAuth.getKey()); + + llvm::Value *Discriminator = PointerAuth.getDiscriminator(); + if (!Discriminator) { + Discriminator = CGF.Builder.getSize(0); + } + + // Convert the pointer to intptr_t before signing it. + auto OrigType = Pointer->getType(); + Pointer = CGF.Builder.CreatePtrToInt(Pointer, CGF.IntPtrTy); + + // call i64 @llvm.ptrauth.sign.i64(i64 %pointer, i32 %key, i64 %discriminator) + auto Intrinsic = CGF.CGM.getIntrinsic(IntrinsicID); + Pointer = CGF.EmitRuntimeCall(Intrinsic, {Pointer, Key, Discriminator}); + + // Convert back to the original type. + Pointer = CGF.Builder.CreateIntToPtr(Pointer, OrigType); + return Pointer; +} + +llvm::Value * +CodeGenFunction::EmitPointerAuthSign(const CGPointerAuthInfo &PointerAuth, + llvm::Value *Pointer) { + if (!PointerAuth.shouldSign()) + return Pointer; + return EmitPointerAuthCommon(*this, PointerAuth, Pointer, + llvm::Intrinsic::ptrauth_sign); +} + +static llvm::Value *EmitStrip(CodeGenFunction &CGF, + const CGPointerAuthInfo &PointerAuth, + llvm::Value *Pointer) { + auto StripIntrinsic = CGF.CGM.getIntrinsic(llvm::Intrinsic::ptrauth_strip); + + auto Key = CGF.Builder.getInt32(PointerAuth.getKey()); + // Convert the pointer to intptr_t before signing it. 
+ auto OrigType = Pointer->getType(); + Pointer = CGF.EmitRuntimeCall( + StripIntrinsic, {CGF.Builder.CreatePtrToInt(Pointer, CGF.IntPtrTy), Key}); + return CGF.Builder.CreateIntToPtr(Pointer, OrigType); +} + +llvm::Value * +CodeGenFunction::EmitPointerAuthAuth(const CGPointerAuthInfo &PointerAuth, + llvm::Value *Pointer) { + if (PointerAuth.shouldStrip()) { + return EmitStrip(*this, PointerAuth, Pointer); + } + if (!PointerAuth.shouldAuth()) { + return Pointer; + } + + return EmitPointerAuthCommon(*this, PointerAuth, Pointer, + llvm::Intrinsic::ptrauth_auth); +} diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index a9c497bde68717..13f12b5d878a6a 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2453,10 +2453,20 @@ class CodeGenFunction : public CodeGenTypeCache { void InitializeVTablePointers(const CXXRecordDecl *ClassDecl); + // VTableAuthMode - whether loading the + // vtable is guaranteed to trap on authentication failure, + // even if the resulting vtable pointer is unused. + enum class VTableAuthMode { + Authenticate, + MustTrap, + UnsafeUbsanStrip // Should only be used for Vptr UBSan check + }; /// GetVTablePtr - Return the Value of the vtable pointer member pointed /// to by This. - llvm::Value *GetVTablePtr(Address This, llvm::Type *VTableTy, - const CXXRecordDecl *VTableClass); + llvm::Value * + GetVTablePtr(Address This, llvm::Type *VTableTy, + const CXXRecordDecl *VTableClass, + VTableAuthMode AuthMode = VTableAuthMode::Authenticate); enum CFITypeCheckKind { CFITCK_VCall, @@ -4417,6 +4427,19 @@ class CodeGenFunction : public CodeGenTypeCache { bool isPointerKnownNonNull(const Expr *E); + /// Create the discriminator from the storage address and the entity hash. + llvm::Value *EmitPointerAuthBlendDiscriminator(llvm::Value *StorageAddress, + llvm::Value *Discriminator); + CGPointerAuthInfo EmitPointerAuthInfo(const PointerAuthSchema &Schema, + llvm::Value *StorageAddress, + GlobalDecl SchemaDecl, + QualType SchemaType); + llvm::Value *EmitPointerAuthSign(QualType PointeeType, llvm::Value *Pointer); + llvm::Value *EmitPointerAuthSign(const CGPointerAuthInfo &Info, + llvm::Value *Pointer); + llvm::Value *EmitPointerAuthAuth(const CGPointerAuthInfo &Info, + llvm::Value *Pointer); + void EmitPointerAuthOperandBundle( const CGPointerAuthInfo &Info, SmallVectorImpl &Bundles); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index dd4a665ebc78b7..652f519d824880 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -907,7 +907,8 @@ void CodeGenModule::Release() { if (Context.getTargetInfo().getTriple().isWasm()) EmitMainVoidAlias(); - if (getTriple().isAMDGPU()) { + if (getTriple().isAMDGPU() || + (getTriple().isSPIRV() && getTriple().getVendor() == llvm::Triple::AMD)) { // Emit amdhsa_code_object_version module flag, which is code object version // times 100.
if (getTarget().getTargetOpts().CodeObjectVersion != @@ -4259,8 +4260,8 @@ void CodeGenModule::emitMultiVersionFunctions() { llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD); if (auto *IFunc = dyn_cast(ResolverConstant)) { ResolverConstant = IFunc->getResolver(); - if (FD->isTargetClonesMultiVersion() || - FD->isTargetVersionMultiVersion()) { + if (FD->isTargetClonesMultiVersion() && + !getTarget().getTriple().isAArch64()) { std::string MangledName = getMangledNameImpl( *this, GD, FD, /*OmitMultiVersionMangling=*/true); if (!GetGlobalValue(MangledName + ".ifunc")) { diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 9913304757caa7..4796d421aaa699 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -435,7 +435,7 @@ class CodeGenModule : public CodeGenTypeCache { // Store deferred function annotations so they can be emitted at the end with // most up to date ValueDecl that will have all the inherited annotations. - llvm::DenseMap DeferredAnnotations; + llvm::MapVector DeferredAnnotations; /// Map used to get unique annotation strings. llvm::StringMap AnnotationStrings; @@ -609,6 +609,13 @@ class CodeGenModule : public CodeGenTypeCache { std::pair, const TopLevelStmtDecl *> GlobalTopLevelStmtBlockInFlight; + llvm::DenseMap PtrAuthDiscriminatorHashes; + + llvm::DenseMap> + VTablePtrAuthInfos; + std::optional + computeVTPointerAuthentication(const CXXRecordDecl *ThisClass); + public: CodeGenModule(ASTContext &C, IntrusiveRefCntPtr FS, const HeaderSearchOptions &headersearchopts, @@ -957,11 +964,33 @@ class CodeGenModule : public CodeGenTypeCache { CGPointerAuthInfo getFunctionPointerAuthInfo(QualType T); + bool shouldSignPointer(const PointerAuthSchema &Schema); + llvm::Constant *getConstantSignedPointer(llvm::Constant *Pointer, + const PointerAuthSchema &Schema, + llvm::Constant *StorageAddress, + GlobalDecl SchemaDecl, + QualType SchemaType); + llvm::Constant * getConstantSignedPointer(llvm::Constant *Pointer, unsigned Key, llvm::Constant *StorageAddress, llvm::ConstantInt *OtherDiscriminator); + llvm::ConstantInt * + getPointerAuthOtherDiscriminator(const PointerAuthSchema &Schema, + GlobalDecl SchemaDecl, QualType SchemaType); + + uint16_t getPointerAuthDeclDiscriminator(GlobalDecl GD); + std::optional + getVTablePointerAuthInfo(CodeGenFunction *Context, + const CXXRecordDecl *Record, + llvm::Value *StorageAddress); + + std::optional + getVTablePointerAuthentication(const CXXRecordDecl *thisClass); + + CGPointerAuthInfo EmitPointerAuthInfo(const RecordDecl *RD); + // Return whether RTTI information should be emitted for this target. 
bool shouldEmitRTTI(bool ForEH = false) { return (ForEH || getLangOpts().RTTI) && !getLangOpts().CUDAIsDevice && diff --git a/clang/lib/CodeGen/ConstantEmitter.h b/clang/lib/CodeGen/ConstantEmitter.h index a55da0dcad7921..eff0a8d52891f0 100644 --- a/clang/lib/CodeGen/ConstantEmitter.h +++ b/clang/lib/CodeGen/ConstantEmitter.h @@ -113,6 +113,9 @@ class ConstantEmitter { llvm::Constant *tryEmitAbstract(const APValue &value, QualType T); llvm::Constant *tryEmitAbstractForMemory(const APValue &value, QualType T); + llvm::Constant *tryEmitConstantSignedPointer(llvm::Constant *Ptr, + PointerAuthQualifier Auth); + llvm::Constant *tryEmitConstantExpr(const ConstantExpr *CE); llvm::Constant *emitNullForMemory(QualType T) { diff --git a/clang/lib/CodeGen/ConstantInitBuilder.cpp b/clang/lib/CodeGen/ConstantInitBuilder.cpp index 3cf69f3b6415e4..549d5dd66b1230 100644 --- a/clang/lib/CodeGen/ConstantInitBuilder.cpp +++ b/clang/lib/CodeGen/ConstantInitBuilder.cpp @@ -296,3 +296,21 @@ ConstantAggregateBuilderBase::finishStruct(llvm::StructType *ty) { buffer.erase(buffer.begin() + Begin, buffer.end()); return constant; } + +/// Sign the given pointer and add it to the constant initializer +/// currently being built. +void ConstantAggregateBuilderBase::addSignedPointer( + llvm::Constant *Pointer, const PointerAuthSchema &Schema, + GlobalDecl CalleeDecl, QualType CalleeType) { + if (!Schema || !Builder.CGM.shouldSignPointer(Schema)) + return add(Pointer); + + llvm::Constant *StorageAddress = nullptr; + if (Schema.isAddressDiscriminated()) { + StorageAddress = getAddrOfCurrentPosition(Pointer->getType()); + } + + llvm::Constant *SignedPointer = Builder.CGM.getConstantSignedPointer( + Pointer, Schema, StorageAddress, CalleeDecl, CalleeType); + add(SignedPointer); +} diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 01a735c1437e1d..63e36e1b838936 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -23,6 +23,7 @@ #include "CGVTables.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" +#include "ConstantEmitter.h" #include "TargetInfo.h" #include "clang/AST/Attr.h" #include "clang/AST/Mangle.h" @@ -336,9 +337,11 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI { bool exportThunk() override { return true; } llvm::Value *performThisAdjustment(CodeGenFunction &CGF, Address This, - const ThisAdjustment &TA) override; + const CXXRecordDecl *UnadjustedThisClass, + const ThunkInfo &TI) override; llvm::Value *performReturnAdjustment(CodeGenFunction &CGF, Address Ret, + const CXXRecordDecl *UnadjustedRetClass, const ReturnAdjustment &RA) override; size_t getSrcArgforCopyCtor(const CXXConstructorDecl *, @@ -1477,10 +1480,22 @@ llvm::Value *ItaniumCXXABI::emitDynamicCastCall( computeOffsetHint(CGF.getContext(), SrcDecl, DestDecl).getQuantity()); // Emit the call to __dynamic_cast. - llvm::Value *Args[] = {ThisAddr.emitRawPointer(CGF), SrcRTTI, DestRTTI, - OffsetHint}; - llvm::Value *Value = - CGF.EmitNounwindRuntimeCall(getItaniumDynamicCastFn(CGF), Args); + llvm::Value *Value = ThisAddr.emitRawPointer(CGF); + if (CGM.getCodeGenOpts().PointerAuth.CXXVTablePointers) { + // We perform a no-op load of the vtable pointer here to force an + // authentication. In environments that do not support pointer + // authentication this is an actual no-op that will be elided. When + // pointer authentication is supported and enforced on vtable pointers this + // load can trap.
+ llvm::Value *Vtable = + CGF.GetVTablePtr(ThisAddr, CGM.Int8PtrTy, SrcDecl, + CodeGenFunction::VTableAuthMode::MustTrap); + assert(Vtable); + (void)Vtable; + } + + llvm::Value *args[] = {Value, SrcRTTI, DestRTTI, OffsetHint}; + Value = CGF.EmitNounwindRuntimeCall(getItaniumDynamicCastFn(CGF), args); /// C++ [expr.dynamic.cast]p9: /// A failed cast to reference type throws std::bad_cast @@ -1955,8 +1970,18 @@ llvm::Value *ItaniumCXXABI::getVTableAddressPointInStructorWithVTT( VirtualPointerIndex); // And load the address point from the VTT. - return CGF.Builder.CreateAlignedLoad(CGF.GlobalsVoidPtrTy, VTT, - CGF.getPointerAlign()); + llvm::Value *AP = + CGF.Builder.CreateAlignedLoad(CGF.GlobalsVoidPtrTy, VTT, + CGF.getPointerAlign()); + + if (auto &Schema = CGF.CGM.getCodeGenOpts().PointerAuth.CXXVTTVTablePointers) { + CGPointerAuthInfo PointerAuth = CGF.EmitPointerAuthInfo(Schema, VTT, + GlobalDecl(), + QualType()); + AP = CGF.EmitPointerAuthAuth(PointerAuth, AP); + } + + return AP; } llvm::GlobalVariable *ItaniumCXXABI::getAddrOfVTable(const CXXRecordDecl *RD, @@ -2008,8 +2033,9 @@ CGCallee ItaniumCXXABI::getVirtualFunctionPointer(CodeGenFunction &CGF, llvm::Value *VTable = CGF.GetVTablePtr(This, PtrTy, MethodDecl->getParent()); uint64_t VTableIndex = CGM.getItaniumVTableContext().getMethodVTableIndex(GD); - llvm::Value *VFunc; - if (CGF.ShouldEmitVTableTypeCheckedLoad(MethodDecl->getParent())) { + llvm::Value *VFunc, *VTableSlotPtr = nullptr; + auto &Schema = CGM.getCodeGenOpts().PointerAuth.CXXVirtualFunctionPointers; + if (!Schema && CGF.ShouldEmitVTableTypeCheckedLoad(MethodDecl->getParent())) { VFunc = CGF.EmitVTableTypeCheckedLoad( MethodDecl->getParent(), VTable, PtrTy, VTableIndex * @@ -2024,7 +2050,7 @@ CGCallee ItaniumCXXABI::getVirtualFunctionPointer(CodeGenFunction &CGF, CGM.getIntrinsic(llvm::Intrinsic::load_relative, {CGM.Int32Ty}), {VTable, llvm::ConstantInt::get(CGM.Int32Ty, 4 * VTableIndex)}); } else { - llvm::Value *VTableSlotPtr = CGF.Builder.CreateConstInBoundsGEP1_64( + VTableSlotPtr = CGF.Builder.CreateConstInBoundsGEP1_64( PtrTy, VTable, VTableIndex, "vfn"); VFuncLoad = CGF.Builder.CreateAlignedLoad(PtrTy, VTableSlotPtr, CGF.getPointerAlign()); @@ -2048,7 +2074,13 @@ CGCallee ItaniumCXXABI::getVirtualFunctionPointer(CodeGenFunction &CGF, VFunc = VFuncLoad; } - CGCallee Callee(GD, VFunc); + CGPointerAuthInfo PointerAuth; + if (Schema) { + assert(VTableSlotPtr && "virtual function pointer not set"); + GD = CGM.getItaniumVTableContext().findOriginalMethod(GD.getCanonicalDecl()); + PointerAuth = CGF.EmitPointerAuthInfo(Schema, VTableSlotPtr, GD, QualType()); + } + CGCallee Callee(GD, VFunc, PointerAuth); return Callee; } @@ -2147,6 +2179,7 @@ bool ItaniumCXXABI::canSpeculativelyEmitVTable(const CXXRecordDecl *RD) const { } static llvm::Value *performTypeAdjustment(CodeGenFunction &CGF, Address InitialPtr, + const CXXRecordDecl *UnadjustedClass, int64_t NonVirtualAdjustment, int64_t VirtualAdjustment, bool IsReturnAdjustment) { @@ -2164,8 +2197,8 @@ static llvm::Value *performTypeAdjustment(CodeGenFunction &CGF, // Perform the virtual adjustment if we have one. 
llvm::Value *ResultPtr; if (VirtualAdjustment) { - Address VTablePtrPtr = V.withElementType(CGF.Int8PtrTy); - llvm::Value *VTablePtr = CGF.Builder.CreateLoad(VTablePtrPtr); + llvm::Value *VTablePtr = + CGF.GetVTablePtr(V, CGF.Int8PtrTy, UnadjustedClass); llvm::Value *Offset; llvm::Value *OffsetPtr = CGF.Builder.CreateConstInBoundsGEP1_64( @@ -2200,18 +2233,20 @@ static llvm::Value *performTypeAdjustment(CodeGenFunction &CGF, return ResultPtr; } -llvm::Value *ItaniumCXXABI::performThisAdjustment(CodeGenFunction &CGF, - Address This, - const ThisAdjustment &TA) { - return performTypeAdjustment(CGF, This, TA.NonVirtual, - TA.Virtual.Itanium.VCallOffsetOffset, +llvm::Value * +ItaniumCXXABI::performThisAdjustment(CodeGenFunction &CGF, Address This, + const CXXRecordDecl *UnadjustedClass, + const ThunkInfo &TI) { + return performTypeAdjustment(CGF, This, UnadjustedClass, TI.This.NonVirtual, + TI.This.Virtual.Itanium.VCallOffsetOffset, /*IsReturnAdjustment=*/false); } llvm::Value * ItaniumCXXABI::performReturnAdjustment(CodeGenFunction &CGF, Address Ret, + const CXXRecordDecl *UnadjustedClass, const ReturnAdjustment &RA) { - return performTypeAdjustment(CGF, Ret, RA.NonVirtual, + return performTypeAdjustment(CGF, Ret, UnadjustedClass, RA.NonVirtual, RA.Virtual.Itanium.VBaseOffsetOffset, /*IsReturnAdjustment=*/true); } @@ -3694,6 +3729,10 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) { VTable, Two); } + if (auto &Schema = CGM.getCodeGenOpts().PointerAuth.CXXTypeInfoVTablePointer) + VTable = CGM.getConstantSignedPointer(VTable, Schema, nullptr, GlobalDecl(), + QualType(Ty, 0)); + Fields.push_back(VTable); } diff --git a/clang/lib/CodeGen/LinkInModulesPass.cpp b/clang/lib/CodeGen/LinkInModulesPass.cpp index c3831aae13b647..44b2df52f001a7 100644 --- a/clang/lib/CodeGen/LinkInModulesPass.cpp +++ b/clang/lib/CodeGen/LinkInModulesPass.cpp @@ -20,15 +20,13 @@ using namespace llvm; -LinkInModulesPass::LinkInModulesPass(clang::BackendConsumer *BC, - bool ShouldLinkFiles) - : BC(BC), ShouldLinkFiles(ShouldLinkFiles) {} +LinkInModulesPass::LinkInModulesPass(clang::BackendConsumer *BC) : BC(BC) {} PreservedAnalyses LinkInModulesPass::run(Module &M, ModuleAnalysisManager &AM) { if (!BC) return PreservedAnalyses::all(); - if (BC->LinkInModules(&M, ShouldLinkFiles)) + if (BC->LinkInModules(&M)) report_fatal_error("Bitcode module postopt linking failed, aborted!"); return PreservedAnalyses::none(); diff --git a/clang/lib/CodeGen/LinkInModulesPass.h b/clang/lib/CodeGen/LinkInModulesPass.h index 7fe94d6250583d..3edbfd076e1508 100644 --- a/clang/lib/CodeGen/LinkInModulesPass.h +++ b/clang/lib/CodeGen/LinkInModulesPass.h @@ -28,10 +28,9 @@ class Pass; /// for use with the legacy pass manager. 
class LinkInModulesPass : public PassInfoMixin { clang::BackendConsumer *BC; - bool ShouldLinkFiles; public: - LinkInModulesPass(clang::BackendConsumer *BC, bool ShouldLinkFiles = true); + LinkInModulesPass(clang::BackendConsumer *BC); PreservedAnalyses run(Module &M, AnalysisManager &); static bool isRequired() { return true; } diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 9ab634fa6ce2e0..cc6740edabcd3c 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -415,9 +415,11 @@ class MicrosoftCXXABI : public CGCXXABI { bool exportThunk() override { return false; } llvm::Value *performThisAdjustment(CodeGenFunction &CGF, Address This, - const ThisAdjustment &TA) override; + const CXXRecordDecl * /*UnadjustedClass*/, + const ThunkInfo &TI) override; llvm::Value *performReturnAdjustment(CodeGenFunction &CGF, Address Ret, + const CXXRecordDecl * /*UnadjustedClass*/, const ReturnAdjustment &RA) override; void EmitThreadLocalInitFuncs( @@ -2223,9 +2225,10 @@ void MicrosoftCXXABI::emitVBTableDefinition(const VPtrInfo &VBT, GV->setLinkage(llvm::GlobalVariable::AvailableExternallyLinkage); } -llvm::Value *MicrosoftCXXABI::performThisAdjustment(CodeGenFunction &CGF, - Address This, - const ThisAdjustment &TA) { +llvm::Value *MicrosoftCXXABI::performThisAdjustment( + CodeGenFunction &CGF, Address This, + const CXXRecordDecl * /*UnadjustedClass*/, const ThunkInfo &TI) { + const ThisAdjustment &TA = TI.This; if (TA.isEmpty()) return This.emitRawPointer(CGF); @@ -2275,9 +2278,10 @@ llvm::Value *MicrosoftCXXABI::performThisAdjustment(CodeGenFunction &CGF, return V; } -llvm::Value * -MicrosoftCXXABI::performReturnAdjustment(CodeGenFunction &CGF, Address Ret, - const ReturnAdjustment &RA) { +llvm::Value *MicrosoftCXXABI::performReturnAdjustment( + CodeGenFunction &CGF, Address Ret, + const CXXRecordDecl * /*UnadjustedClass*/, const ReturnAdjustment &RA) { + if (RA.isEmpty()) return Ret.emitRawPointer(CGF); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 0298d22203d9d4..221e222bdd47d9 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -147,6 +147,14 @@ getNVIDIAOffloadTargetTriple(const Driver &D, const ArgList &Args, static std::optional getHIPOffloadTargetTriple(const Driver &D, const ArgList &Args) { if (!Args.hasArg(options::OPT_offload_EQ)) { + auto OffloadArchs = Args.getAllArgValues(options::OPT_offload_arch_EQ); + if (llvm::find(OffloadArchs, "amdgcnspirv") != OffloadArchs.cend()) { + if (OffloadArchs.size() == 1) + return llvm::Triple("spirv64-amd-amdhsa"); + // Mixing specific & SPIR-V compilation is not supported for now. + D.Diag(diag::err_drv_only_one_offload_target_supported); + return std::nullopt; + } return llvm::Triple("amdgcn-amd-amdhsa"); // Default HIP triple. } auto TT = getOffloadTargetTriple(D, Args); @@ -229,6 +237,9 @@ Driver::Driver(StringRef ClangExecutable, StringRef TargetTriple, UserConfigDir = static_cast(P); } #endif + + // Compute the path to the resource directory. + ResourceDir = GetResourcesPath(ClangExecutable, CLANG_RESOURCE_DIR); } void Driver::setDriverMode(StringRef Value) { @@ -247,24 +258,6 @@ void Driver::setDriverMode(StringRef Value) { Diag(diag::err_drv_unsupported_option_argument) << OptName << Value; } -void Driver::setResourceDirectory() { - // Compute the path to the resource directory, depending on the driver mode. 
- switch (Mode) { - case GCCMode: - case GXXMode: - case CPPMode: - case CLMode: - case DXCMode: - ResourceDir = GetResourcesPath(ClangExecutable, CLANG_RESOURCE_DIR); - break; - case FlangMode: - SmallString<64> customResourcePathRelativeToDriver{".."}; - ResourceDir = - GetResourcesPath(ClangExecutable, customResourcePathRelativeToDriver); - break; - } -} - InputArgList Driver::ParseArgStrings(ArrayRef ArgStrings, bool UseDriverMode, bool &ContainsError) { llvm::PrettyStackTraceString CrashInfo("Command line argument parsing"); @@ -370,6 +363,7 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL, // -{fsyntax-only,-analyze,emit-ast} only run up to the compiler. } else if ((PhaseArg = DAL.getLastArg(options::OPT_fsyntax_only)) || (PhaseArg = DAL.getLastArg(options::OPT_print_supported_cpus)) || + (PhaseArg = DAL.getLastArg(options::OPT_print_enabled_extensions)) || (PhaseArg = DAL.getLastArg(options::OPT_module_file_info)) || (PhaseArg = DAL.getLastArg(options::OPT_verify_pch)) || (PhaseArg = DAL.getLastArg(options::OPT_rewrite_objc)) || @@ -905,11 +899,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, } for (StringRef Arch : Archs) { - if (NVPTXTriple && IsNVIDIAGpuArch(StringToCudaArch( + if (NVPTXTriple && IsNVIDIAOffloadArch(StringToOffloadArch( getProcessorFromTargetID(*NVPTXTriple, Arch)))) { DerivedArchs[NVPTXTriple->getTriple()].insert(Arch); } else if (AMDTriple && - IsAMDGpuArch(StringToCudaArch( + IsAMDOffloadArch(StringToOffloadArch( getProcessorFromTargetID(*AMDTriple, Arch)))) { DerivedArchs[AMDTriple->getTriple()].insert(Arch); } else { @@ -1217,7 +1211,6 @@ Compilation *Driver::BuildCompilation(ArrayRef ArgList) { if (!DriverMode.empty()) setDriverMode(DriverMode); - setResourceDirectory(); // FIXME: What are we going to do with -V and -b? // Arguments specified in command line. @@ -2171,7 +2164,8 @@ bool Driver::HandleImmediateArgs(const Compilation &C) { if (C.getArgs().hasArg(options::OPT_v) || C.getArgs().hasArg(options::OPT__HASH_HASH_HASH) || C.getArgs().hasArg(options::OPT_print_supported_cpus) || - C.getArgs().hasArg(options::OPT_print_supported_extensions)) { + C.getArgs().hasArg(options::OPT_print_supported_extensions) || + C.getArgs().hasArg(options::OPT_print_enabled_extensions)) { PrintVersion(C, llvm::errs()); SuppressMissingInputWarning = true; } @@ -2954,7 +2948,7 @@ class OffloadingActionBuilder final { struct TargetID { /// Target ID string which is persistent throughout the compilation. const char *ID; - TargetID(CudaArch Arch) { ID = CudaArchToString(Arch); } + TargetID(OffloadArch Arch) { ID = OffloadArchToString(Arch); } TargetID(const char *ID) : ID(ID) {} operator const char *() { return ID; } operator StringRef() { return StringRef(ID); } @@ -2975,7 +2969,7 @@ class OffloadingActionBuilder final { bool Relocatable = false; /// Default GPU architecture if there's no one specified. - CudaArch DefaultCudaArch = CudaArch::UNKNOWN; + OffloadArch DefaultOffloadArch = OffloadArch::UNKNOWN; /// Method to generate compilation unit ID specified by option /// '-fuse-cuid='. @@ -3104,7 +3098,7 @@ class OffloadingActionBuilder final { // If we have a fat binary, add it to the list. if (CudaFatBinary) { - AddTopLevel(CudaFatBinary, CudaArch::UNUSED); + AddTopLevel(CudaFatBinary, OffloadArch::UNUSED); CudaDeviceActions.clear(); CudaFatBinary = nullptr; return; @@ -3247,10 +3241,14 @@ class OffloadingActionBuilder final { // supported GPUs. sm_20 code should work correctly, if // suboptimally, on all newer GPUs. 
if (GpuArchList.empty()) { - if (ToolChains.front()->getTriple().isSPIRV()) - GpuArchList.push_back(CudaArch::Generic); - else - GpuArchList.push_back(DefaultCudaArch); + if (ToolChains.front()->getTriple().isSPIRV()) { + if (ToolChains.front()->getTriple().getVendor() == llvm::Triple::AMD) + GpuArchList.push_back(OffloadArch::AMDGCNSPIRV); + else + GpuArchList.push_back(OffloadArch::Generic); + } else { + GpuArchList.push_back(DefaultOffloadArch); + } } return Error; @@ -3264,16 +3262,16 @@ class OffloadingActionBuilder final { CudaActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) { - DefaultCudaArch = CudaArch::CudaDefault; + DefaultOffloadArch = OffloadArch::CudaDefault; } StringRef getCanonicalOffloadArch(StringRef ArchStr) override { - CudaArch Arch = StringToCudaArch(ArchStr); - if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) { + OffloadArch Arch = StringToOffloadArch(ArchStr); + if (Arch == OffloadArch::UNKNOWN || !IsNVIDIAOffloadArch(Arch)) { C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr; return StringRef(); } - return CudaArchToString(Arch); + return OffloadArchToString(Arch); } std::optional> @@ -3403,7 +3401,7 @@ class OffloadingActionBuilder final { const Driver::InputList &Inputs) : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) { - DefaultCudaArch = CudaArch::HIPDefault; + DefaultOffloadArch = OffloadArch::HIPDefault; if (Args.hasArg(options::OPT_fhip_emit_relocatable, options::OPT_fno_hip_emit_relocatable)) { @@ -4351,13 +4349,14 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, } for (auto Opt : {options::OPT_print_supported_cpus, - options::OPT_print_supported_extensions}) { + options::OPT_print_supported_extensions, + options::OPT_print_enabled_extensions}) { // If --print-supported-cpus, -mcpu=? or -mtune=? is specified, build a // custom Compile phase that prints out supported cpu models and quits. // - // If --print-supported-extensions is specified, call the helper function - // RISCVMarchHelp in RISCVISAInfo.cpp that prints out supported extensions - // and quits. + // If either --print-supported-extensions or --print-enabled-extensions is + // specified, call the corresponding helper function that prints out the + // supported/enabled extensions and quits. if (Arg *A = Args.getLastArg(Opt)) { if (Opt == options::OPT_print_supported_extensions && !C.getDefaultToolChain().getTriple().isRISCV() && @@ -4367,6 +4366,12 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, << "--print-supported-extensions"; return; } + if (Opt == options::OPT_print_enabled_extensions && + !C.getDefaultToolChain().getTriple().isAArch64()) { + C.getDriver().Diag(diag::err_opt_not_valid_on_target) + << "--print-enabled-extensions"; + return; + } // Use the -mcpu=? flag as the dummy input to cc1. Actions.clear(); @@ -4403,23 +4408,24 @@ static StringRef getCanonicalArchString(Compilation &C, bool SuppressError = false) { // Lookup the CUDA / HIP architecture string. Only report an error if we were // expecting the triple to be only NVPTX / AMDGPU. 
- CudaArch Arch = StringToCudaArch(getProcessorFromTargetID(Triple, ArchStr)); + OffloadArch Arch = + StringToOffloadArch(getProcessorFromTargetID(Triple, ArchStr)); if (!SuppressError && Triple.isNVPTX() && - (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch))) { + (Arch == OffloadArch::UNKNOWN || !IsNVIDIAOffloadArch(Arch))) { C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) << "CUDA" << ArchStr; return StringRef(); } else if (!SuppressError && Triple.isAMDGPU() && - (Arch == CudaArch::UNKNOWN || !IsAMDGpuArch(Arch))) { + (Arch == OffloadArch::UNKNOWN || !IsAMDOffloadArch(Arch))) { C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) << "HIP" << ArchStr; return StringRef(); } - if (IsNVIDIAGpuArch(Arch)) - return Args.MakeArgStringRef(CudaArchToString(Arch)); + if (IsNVIDIAOffloadArch(Arch)) + return Args.MakeArgStringRef(OffloadArchToString(Arch)); - if (IsAMDGpuArch(Arch)) { + if (IsAMDOffloadArch(Arch)) { llvm::StringMap Features; auto HIPTriple = getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()); if (!HIPTriple) @@ -4540,9 +4546,9 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, if (Archs.empty()) { if (Kind == Action::OFK_Cuda) - Archs.insert(CudaArchToString(CudaArch::CudaDefault)); + Archs.insert(OffloadArchToString(OffloadArch::CudaDefault)); else if (Kind == Action::OFK_HIP) - Archs.insert(CudaArchToString(CudaArch::HIPDefault)); + Archs.insert(OffloadArchToString(OffloadArch::HIPDefault)); else if (Kind == Action::OFK_OpenMP) Archs.insert(StringRef()); } else { @@ -4592,10 +4598,13 @@ Action *Driver::BuildOffloadingActions(Compilation &C, // Get the product of all bound architectures and toolchains. SmallVector> TCAndArchs; - for (const ToolChain *TC : ToolChains) - for (StringRef Arch : getOffloadArchs(C, Args, Kind, TC)) + for (const ToolChain *TC : ToolChains) { + llvm::DenseSet Arches = getOffloadArchs(C, Args, Kind, TC); + SmallVector Sorted(Arches.begin(), Arches.end()); + llvm::sort(Sorted); + for (StringRef Arch : Sorted) TCAndArchs.push_back(std::make_pair(TC, Arch)); - llvm::sort(TCAndArchs, llvm::less_second()); + } for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I) DeviceActions.push_back(C.MakeAction(*InputArg, InputType)); @@ -6517,9 +6526,11 @@ const ToolChain &Driver::getOffloadingDeviceToolChain( // things. 
switch (TargetDeviceOffloadKind) { case Action::OFK_HIP: { - if (Target.getArch() == llvm::Triple::amdgcn && - Target.getVendor() == llvm::Triple::AMD && - Target.getOS() == llvm::Triple::AMDHSA) + if (((Target.getArch() == llvm::Triple::amdgcn || + Target.getArch() == llvm::Triple::spirv64) && + Target.getVendor() == llvm::Triple::AMD && + Target.getOS() == llvm::Triple::AMDHSA) || + !Args.hasArgNoClaim(options::OPT_offload_EQ)) TC = std::make_unique(*this, Target, HostTC, Args); else if (Target.getArch() == llvm::Triple::spirv64 && diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index 191d108e9b7398..a4ab846ed2c57a 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -79,7 +79,8 @@ OffloadTargetInfo::OffloadTargetInfo(const StringRef Target, auto TargetFeatures = Target.split(':'); auto TripleOrGPU = TargetFeatures.first.rsplit('-'); - if (clang::StringToCudaArch(TripleOrGPU.second) != clang::CudaArch::UNKNOWN) { + if (clang::StringToOffloadArch(TripleOrGPU.second) != + clang::OffloadArch::UNKNOWN) { auto KindTriple = TripleOrGPU.first.split('-'); this->OffloadKind = KindTriple.first; diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 40ab2e91125d1e..8f4cc47e418b5d 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -195,11 +195,11 @@ static void getAArch64MultilibFlags(const Driver &D, UnifiedFeatures.end()); std::vector MArch; for (const auto &Ext : AArch64::Extensions) - if (FeatureSet.contains(Ext.Feature)) - MArch.push_back(Ext.Name.str()); + if (FeatureSet.contains(Ext.PosTargetFeature)) + MArch.push_back(Ext.UserVisibleName.str()); for (const auto &Ext : AArch64::Extensions) - if (FeatureSet.contains(Ext.NegFeature)) - MArch.push_back(("no" + Ext.Name).str()); + if (FeatureSet.contains(Ext.NegTargetFeature)) + MArch.push_back(("no" + Ext.UserVisibleName).str()); StringRef ArchName; for (const auto &ArchInfo : AArch64::ArchInfos) if (FeatureSet.contains(ArchInfo->ArchFeature)) diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index d17ecb15c82088..1c0fb4babe3a55 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -87,7 +87,7 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs( llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError())); getDriver().Diag(diag::err_drv_undetermined_gpu_arch) << llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march"; - Arch = CudaArchToString(CudaArch::HIPDefault); + Arch = OffloadArchToString(OffloadArch::HIPDefault); } else { Arch = Args.MakeArgString(ArchsOrErr->front()); } diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 75f9c99d5d0bf4..92821b2a82daec 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -274,7 +274,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, for (StringRef Value : A->getValues()) { if (Value == "egpr" || Value == "push2pop2" || Value == "ppx" || Value == "ndd" || Value == "ccmp" || Value == "nf" || - Value == "cf") { + Value == "cf" || Value == "zu") { Features.push_back( Args.MakeArgString((IsNegative ? 
"-" : "+") + Value)); continue; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2ce9e2f4bcfcd4..1b7cc82ea816ed 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1524,7 +1524,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, auto isPAuthLR = [](const char *member) { llvm::AArch64::ExtensionInfo pauthlr_extension = llvm::AArch64::getExtensionByID(llvm::AArch64::AEK_PAUTHLR); - return pauthlr_extension.Feature == member; + return pauthlr_extension.PosTargetFeature == member; }; if (std::any_of(CmdArgs.begin(), CmdArgs.end(), isPAuthLR)) @@ -4939,7 +4939,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); if (JA.isDeviceOffloading(Action::OFK_HIP) && - getToolChain().getTriple().isAMDGPU()) { + (getToolChain().getTriple().isAMDGPU() || + (getToolChain().getTriple().isSPIRV() && + getToolChain().getTriple().getVendor() == llvm::Triple::AMD))) { // Device side compilation printf if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) { CmdArgs.push_back(Args.MakeArgString( diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 2dfc7457b0ac7c..08a4633902654d 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -223,13 +223,13 @@ CudaInstallationDetector::CudaInstallationDetector( // CUDA-9+ uses single libdevice file for all GPU variants. std::string FilePath = LibDevicePath + "/libdevice.10.bc"; if (FS.exists(FilePath)) { - for (int Arch = (int)CudaArch::SM_30, E = (int)CudaArch::LAST; Arch < E; - ++Arch) { - CudaArch GpuArch = static_cast(Arch); - if (!IsNVIDIAGpuArch(GpuArch)) + for (int Arch = (int)OffloadArch::SM_30, E = (int)OffloadArch::LAST; + Arch < E; ++Arch) { + OffloadArch OA = static_cast(Arch); + if (!IsNVIDIAOffloadArch(OA)) continue; - std::string GpuArchName(CudaArchToString(GpuArch)); - LibDeviceMap[GpuArchName] = FilePath; + std::string OffloadArchName(OffloadArchToString(OA)); + LibDeviceMap[OffloadArchName] = FilePath; } } } else { @@ -312,17 +312,17 @@ void CudaInstallationDetector::AddCudaIncludeArgs( } void CudaInstallationDetector::CheckCudaVersionSupportsArch( - CudaArch Arch) const { - if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN || + OffloadArch Arch) const { + if (Arch == OffloadArch::UNKNOWN || Version == CudaVersion::UNKNOWN || ArchsWithBadVersion[(int)Arch]) return; - auto MinVersion = MinVersionForCudaArch(Arch); - auto MaxVersion = MaxVersionForCudaArch(Arch); + auto MinVersion = MinVersionForOffloadArch(Arch); + auto MaxVersion = MaxVersionForOffloadArch(Arch); if (Version < MinVersion || Version > MaxVersion) { ArchsWithBadVersion[(int)Arch] = true; D.Diag(diag::err_drv_cuda_version_unsupported) - << CudaArchToString(Arch) << CudaVersionToString(MinVersion) + << OffloadArchToString(Arch) << CudaVersionToString(MinVersion) << CudaVersionToString(MaxVersion) << InstallPath << CudaVersionToString(Version); } @@ -401,8 +401,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, } // Obtain architecture from the action. - CudaArch gpu_arch = StringToCudaArch(GPUArchName); - assert(gpu_arch != CudaArch::UNKNOWN && + OffloadArch gpu_arch = StringToOffloadArch(GPUArchName); + assert(gpu_arch != OffloadArch::UNKNOWN && "Device action expected to have an architecture."); // Check that our installation's ptxas supports gpu_arch. 
@@ -457,7 +457,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-v"); CmdArgs.push_back("--gpu-name"); - CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch))); + CmdArgs.push_back(Args.MakeArgString(OffloadArchToString(gpu_arch))); CmdArgs.push_back("--output-file"); std::string OutputFileName = TC.getInputFilename(Output); @@ -553,7 +553,7 @@ void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA, const char *gpu_arch_str = A->getOffloadingArch(); assert(gpu_arch_str && "Device action expected to have associated a GPU architecture!"); - CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); + OffloadArch gpu_arch = StringToOffloadArch(gpu_arch_str); if (II.getType() == types::TY_PP_Asm && !shouldIncludePTX(Args, gpu_arch_str)) @@ -561,7 +561,7 @@ void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA, // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. const char *Arch = (II.getType() == types::TY_PP_Asm) - ? CudaArchToVirtualArchString(gpu_arch) + ? OffloadArchToVirtualArchString(gpu_arch) : gpu_arch_str; CmdArgs.push_back( Args.MakeArgString(llvm::Twine("--image=profile=") + Arch + @@ -758,7 +758,7 @@ NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, if (!DAL->hasArg(options::OPT_march_EQ) && OffloadKind != Action::OFK_None) { DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), - CudaArchToString(CudaArch::CudaDefault)); + OffloadArchToString(OffloadArch::CudaDefault)); } else if (DAL->getLastArgValue(options::OPT_march_EQ) == "generic" && OffloadKind == Action::OFK_None) { DAL->eraseArg(options::OPT_march_EQ); @@ -938,7 +938,7 @@ void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) { StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ); assert(!Arch.empty() && "Must have an explicit GPU arch."); - CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch)); + CudaInstallation.CheckCudaVersionSupportsArch(StringToOffloadArch(Arch)); } CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); } @@ -984,7 +984,7 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError())); getDriver().Diag(diag::err_drv_undetermined_gpu_arch) << llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march"; - Arch = CudaArchToString(CudaArch::CudaDefault); + Arch = OffloadArchToString(OffloadArch::CudaDefault); } else { Arch = Args.MakeArgString(ArchsOrErr->front()); } diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h index 43c17ba7c0ba03..7464d88cb350ba 100644 --- a/clang/lib/Driver/ToolChains/Cuda.h +++ b/clang/lib/Driver/ToolChains/Cuda.h @@ -37,7 +37,7 @@ class CudaInstallationDetector { // CUDA architectures for which we have raised an error in // CheckCudaVersionSupportsArch. - mutable std::bitset<(int)CudaArch::LAST> ArchsWithBadVersion; + mutable std::bitset<(int)OffloadArch::LAST> ArchsWithBadVersion; public: CudaInstallationDetector(const Driver &D, const llvm::Triple &HostTriple, @@ -50,7 +50,7 @@ class CudaInstallationDetector { /// /// If either Version or Arch is unknown, does not emit an error. Emits at /// most one error per Arch. - void CheckCudaVersionSupportsArch(CudaArch Arch) const; + void CheckCudaVersionSupportsArch(OffloadArch Arch) const; /// Check whether we detected a valid Cuda install. 
bool isValid() const { return IsValid; } diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 34236e8bcf9490..c35b0febb262da 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -10,6 +10,7 @@ #include "AMDGPU.h" #include "CommonArgs.h" #include "HIPUtility.h" +#include "SPIRV.h" #include "clang/Basic/Cuda.h" #include "clang/Basic/TargetID.h" #include "clang/Driver/Compilation.h" @@ -193,6 +194,33 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, Lld, LldArgs, Inputs, Output)); } +// For SPIR-V the inputs for the job are device AMDGCN SPIR-V flavoured bitcode +// and the output is either a compiled SPIR-V binary or bitcode (-emit-llvm). It +// calls llvm-link and then the llvm-spirv translator. Once the SPIR-V BE +// is promoted from experimental, we will switch to using it. TODO: consider +// if we want to run any targeted optimisations over IR here, over generic +// SPIR-V. +void AMDGCN::Linker::constructLinkAndEmitSpirvCommand( + Compilation &C, const JobAction &JA, const InputInfoList &Inputs, + const InputInfo &Output, const llvm::opt::ArgList &Args) const { + assert(!Inputs.empty() && "Must have at least one input."); + + constructLlvmLinkCommand(C, JA, Inputs, Output, Args); + + // Linked BC is now in Output + + // Emit SPIR-V binary. + llvm::opt::ArgStringList TrArgs{ + "--spirv-max-version=1.6", + "--spirv-ext=+all", + "--spirv-allow-extra-diexpressions", + "--spirv-allow-unknown-intrinsics", + "--spirv-lower-const-expr", + "--spirv-preserve-auxdata", + "--spirv-debug-info-version=nonsemantic-shader-200"}; + SPIRV::constructTranslateCommand(C, *this, JA, Output, Output, TrArgs); +} + // For amdgcn the inputs of the linker job are device bitcode and output is // either an object file or bitcode (-emit-llvm). It calls llvm-link, opt, // llc, then lld steps. @@ -214,6 +242,9 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (JA.getType() == types::TY_LLVM_BC) return constructLlvmLinkCommand(C, JA, Inputs, Output, Args); + if (getToolChain().getTriple().isSPIRV()) + return constructLinkAndEmitSpirvCommand(C, JA, Inputs, Output, Args); + return constructLldCommand(C, JA, Inputs, Output, Args); } @@ -270,6 +301,13 @@ void HIPAMDToolChain::addClangTargetOptions( CC1Args.push_back("-fapply-global-visibility-to-externs"); } + // For SPIR-V we embed the command-line into the generated binary, in order to + // retrieve it at JIT time and be able to do target specific compilation with + // options that match the user-supplied ones. + if (getTriple().isSPIRV() && + !DriverArgs.hasArg(options::OPT_fembed_bitcode_marker)) + CC1Args.push_back("-fembed-bitcode=marker"); + for (auto BCFile : getDeviceLibs(DriverArgs)) { CC1Args.push_back(BCFile.ShouldInternalize ?
"-mlink-builtin-bitcode" : "-mlink-bitcode-file"); @@ -303,7 +341,8 @@ HIPAMDToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, } Tool *HIPAMDToolChain::buildLinker() const { - assert(getTriple().getArch() == llvm::Triple::amdgcn); + assert(getTriple().getArch() == llvm::Triple::amdgcn || + getTriple().getArch() == llvm::Triple::spirv64); return new tools::AMDGCN::Linker(*this); } @@ -358,7 +397,9 @@ VersionTuple HIPAMDToolChain::computeMSVCVersion(const Driver *D, llvm::SmallVector HIPAMDToolChain::getDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { llvm::SmallVector BCLibs; - if (DriverArgs.hasArg(options::OPT_nogpulib)) + if (DriverArgs.hasArg(options::OPT_nogpulib) || + (getTriple().getArch() == llvm::Triple::spirv64 && + getTriple().getVendor() == llvm::Triple::AMD)) return {}; ArgStringList LibraryPaths; diff --git a/clang/lib/Driver/ToolChains/HIPAMD.h b/clang/lib/Driver/ToolChains/HIPAMD.h index d81a9733014ccb..c31894e22c5c83 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.h +++ b/clang/lib/Driver/ToolChains/HIPAMD.h @@ -40,6 +40,10 @@ class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { const InputInfoList &Inputs, const InputInfo &Output, const llvm::opt::ArgList &Args) const; + void constructLinkAndEmitSpirvCommand(Compilation &C, const JobAction &JA, + const InputInfoList &Inputs, + const InputInfo &Output, + const llvm::opt::ArgList &Args) const; }; } // end namespace AMDGCN diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 7126e018ca5b6f..e82ed2ca79ffd6 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -341,6 +341,7 @@ SanitizerMask Solaris::getSupportedSanitizers() const { Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; } + Res |= SanitizerKind::SafeStack; Res |= SanitizerKind::Vptr; return Res; } diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 5b763df9b33293..60bd97e0ee987c 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -158,44 +158,46 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); - // When optimizing, if wasm-opt is available, run it. - std::string WasmOptPath; - if (Args.getLastArg(options::OPT_O_Group)) { - WasmOptPath = ToolChain.GetProgramPath("wasm-opt"); - if (WasmOptPath == "wasm-opt") { - WasmOptPath = {}; + if (Args.hasFlag(options::OPT_wasm_opt, options::OPT_no_wasm_opt, true)) { + // When optimizing, if wasm-opt is available, run it. 
+ std::string WasmOptPath; + if (Args.getLastArg(options::OPT_O_Group)) { + WasmOptPath = ToolChain.GetProgramPath("wasm-opt"); + if (WasmOptPath == "wasm-opt") { + WasmOptPath = {}; + } } - } - - if (!WasmOptPath.empty()) { - CmdArgs.push_back("--keep-section=target_features"); - } - C.addCommand(std::make_unique(JA, *this, - ResponseFileSupport::AtFileCurCP(), - Linker, CmdArgs, Inputs, Output)); - - if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { if (!WasmOptPath.empty()) { - StringRef OOpt = "s"; - if (A->getOption().matches(options::OPT_O4) || - A->getOption().matches(options::OPT_Ofast)) - OOpt = "4"; - else if (A->getOption().matches(options::OPT_O0)) - OOpt = "0"; - else if (A->getOption().matches(options::OPT_O)) - OOpt = A->getValue(); - - if (OOpt != "0") { - const char *WasmOpt = Args.MakeArgString(WasmOptPath); - ArgStringList OptArgs; - OptArgs.push_back(Output.getFilename()); - OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); - OptArgs.push_back("-o"); - OptArgs.push_back(Output.getFilename()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), WasmOpt, OptArgs, - Inputs, Output)); + CmdArgs.push_back("--keep-section=target_features"); + } + + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Linker, CmdArgs, Inputs, Output)); + + if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { + if (!WasmOptPath.empty()) { + StringRef OOpt = "s"; + if (A->getOption().matches(options::OPT_O4) || + A->getOption().matches(options::OPT_Ofast)) + OOpt = "4"; + else if (A->getOption().matches(options::OPT_O0)) + OOpt = "0"; + else if (A->getOption().matches(options::OPT_O)) + OOpt = A->getValue(); + + if (OOpt != "0") { + const char *WasmOpt = Args.MakeArgString(WasmOptPath); + ArgStringList OptArgs; + OptArgs.push_back(Output.getFilename()); + OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); + OptArgs.push_back("-o"); + OptArgs.push_back(Output.getFilename()); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileCurCP(), WasmOpt, OptArgs, + Inputs, Output)); + } } } } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index cd21fbb2221ac6..259ef1dd00e3f9 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -369,6 +369,14 @@ template <> struct ScalarEnumerationTraits { } }; +template <> struct MappingTraits { + static void mapping(IO &IO, FormatStyle::KeepEmptyLinesStyle &Value) { + IO.mapOptional("AtEndOfFile", Value.AtEndOfFile); + IO.mapOptional("AtStartOfBlock", Value.AtStartOfBlock); + IO.mapOptional("AtStartOfFile", Value.AtStartOfFile); + } +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::LanguageKind &Value) { IO.enumCase(Value, "Cpp", FormatStyle::LK_Cpp); @@ -869,6 +877,9 @@ template <> struct MappingTraits { OnCurrentLine); IO.mapOptional("DeriveLineEnding", DeriveLineEnding); IO.mapOptional("DerivePointerBinding", Style.DerivePointerAlignment); + IO.mapOptional("KeepEmptyLinesAtEOF", Style.KeepEmptyLines.AtEndOfFile); + IO.mapOptional("KeepEmptyLinesAtTheStartOfBlocks", + Style.KeepEmptyLines.AtStartOfBlock); IO.mapOptional("IndentFunctionDeclarationAfterType", Style.IndentWrappedFunctionNames); IO.mapOptional("IndentRequires", Style.IndentRequiresClause); @@ -1004,9 +1015,7 @@ template <> struct MappingTraits { IO.mapOptional("JavaImportGroups", Style.JavaImportGroups); IO.mapOptional("JavaScriptQuotes", Style.JavaScriptQuotes); IO.mapOptional("JavaScriptWrapImports", 
Style.JavaScriptWrapImports); - IO.mapOptional("KeepEmptyLinesAtTheStartOfBlocks", - Style.KeepEmptyLinesAtTheStartOfBlocks); - IO.mapOptional("KeepEmptyLinesAtEOF", Style.KeepEmptyLinesAtEOF); + IO.mapOptional("KeepEmptyLines", Style.KeepEmptyLines); IO.mapOptional("LambdaBodyIndentation", Style.LambdaBodyIndentation); IO.mapOptional("LineEnding", Style.LineEnding); IO.mapOptional("MacroBlockBegin", Style.MacroBlockBegin); @@ -1517,8 +1526,11 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { /*Hex=*/0, /*HexMinDigits=*/0}; LLVMStyle.JavaScriptQuotes = FormatStyle::JSQS_Leave; LLVMStyle.JavaScriptWrapImports = true; - LLVMStyle.KeepEmptyLinesAtEOF = false; - LLVMStyle.KeepEmptyLinesAtTheStartOfBlocks = true; + LLVMStyle.KeepEmptyLines = { + /*AtEndOfFile=*/false, + /*AtStartOfBlock=*/true, + /*AtStartOfFile=*/true, + }; LLVMStyle.LambdaBodyIndentation = FormatStyle::LBI_Signature; LLVMStyle.Language = Language; LLVMStyle.LineEnding = FormatStyle::LE_DeriveLF; @@ -1641,7 +1653,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { {".*", 3, 0, false}}; GoogleStyle.IncludeStyle.IncludeIsMainRegex = "([-_](test|unittest))?$"; GoogleStyle.IndentCaseLabels = true; - GoogleStyle.KeepEmptyLinesAtTheStartOfBlocks = false; + GoogleStyle.KeepEmptyLines.AtStartOfBlock = false; GoogleStyle.ObjCBinPackProtocolList = FormatStyle::BPS_Never; GoogleStyle.ObjCSpaceAfterProperty = false; GoogleStyle.ObjCSpaceBeforeProtocolList = true; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 89e134144d433c..03082cd2742c8c 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -176,10 +176,6 @@ class AnnotatingParser { Left->ParentBracket = Contexts.back().ContextKind; ScopedContextCreator ContextCreator(*this, tok::less, 12); - // If this angle is in the context of an expression, we need to be more - // hesitant to detect it as opening template parameters. - bool InExprContext = Contexts.back().IsExpression; - Contexts.back().IsExpression = false; // If there's a template keyword before the opening angle bracket, this is a // template parameter, not an argument. @@ -231,11 +227,8 @@ class AnnotatingParser { next(); continue; } - if (CurrentToken->isOneOf(tok::r_paren, tok::r_square, tok::r_brace) || - (CurrentToken->isOneOf(tok::colon, tok::question) && InExprContext && - !Style.isCSharp() && !Style.isProto())) { + if (CurrentToken->isOneOf(tok::r_paren, tok::r_square, tok::r_brace)) return false; - } // If a && or || is found and interpreted as a binary operator, this set // of angles is likely part of something like "a < b && c > d". If the // angles are inside an expression, the ||/&& might also be a binary diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 729f3d78f4a354..1804c1437fd41d 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -1478,11 +1478,13 @@ static auto computeNewlines(const AnnotatedLine &Line, Newlines = std::min(Newlines, 1u); if (Newlines == 0 && !RootToken.IsFirst) Newlines = 1; - if (RootToken.IsFirst && !RootToken.HasUnescapedNewline) + if (RootToken.IsFirst && + (!Style.KeepEmptyLines.AtStartOfFile || !RootToken.HasUnescapedNewline)) { Newlines = 0; + } // Remove empty lines after "{". 
- if (!Style.KeepEmptyLinesAtTheStartOfBlocks && PreviousLine && + if (!Style.KeepEmptyLines.AtStartOfBlock && PreviousLine && PreviousLine->Last->is(tok::l_brace) && !PreviousLine->startsWithNamespace() && !(PrevPrevLine && PrevPrevLine->startsWithNamespace() && @@ -1554,9 +1556,9 @@ void UnwrappedLineFormatter::formatFirstToken( unsigned NewlineIndent) { FormatToken &RootToken = *Line.First; if (RootToken.is(tok::eof)) { - unsigned Newlines = - std::min(RootToken.NewlinesBefore, - Style.KeepEmptyLinesAtEOF ? Style.MaxEmptyLinesToKeep + 1 : 1); + unsigned Newlines = std::min( + RootToken.NewlinesBefore, + Style.KeepEmptyLines.AtEndOfFile ? Style.MaxEmptyLinesToKeep + 1 : 1); unsigned TokenIndent = Newlines ? NewlineIndent : 0; Whitespaces->replaceWhitespace(RootToken, Newlines, TokenIndent, TokenIndent); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index a6d9f42ace9cc9..f42e28ba7e6294 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1468,6 +1468,17 @@ void CompilerInvocation::setDefaultPointerAuthOptions( // If you change anything here, be sure to update . Opts.FunctionPointers = PointerAuthSchema(Key::ASIA, false, Discrimination::None); + + Opts.CXXVTablePointers = PointerAuthSchema( + Key::ASDA, LangOpts.PointerAuthVTPtrAddressDiscrimination, + LangOpts.PointerAuthVTPtrTypeDiscrimination ? Discrimination::Type + : Discrimination::None); + Opts.CXXTypeInfoVTablePointer = + PointerAuthSchema(Key::ASDA, false, Discrimination::None); + Opts.CXXVTTVTablePointers = + PointerAuthSchema(Key::ASDA, false, Discrimination::None); + Opts.CXXVirtualFunctionPointers = Opts.CXXVirtualVariadicFunctionPointers = + PointerAuthSchema(Key::ASIA, true, Discrimination::Decl); } } diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 2d5c94c7602522..55ec4600648308 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -934,6 +934,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.GNUCVersion && LangOpts.CPlusPlus11) Builder.defineMacro("__GXX_EXPERIMENTAL_CXX0X__"); + if (TI.getTriple().isWindowsGNUEnvironment()) { + // Set ABI defining macros for libstdc++ for MinGW, where the + // default in libstdc++ differs from the defaults for this target. + Builder.defineMacro("__GXX_TYPEINFO_EQUALITY_INLINE", "0"); + } + if (LangOpts.ObjC) { if (LangOpts.ObjCRuntime.isNonFragile()) { Builder.defineMacro("__OBJC2__"); diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h index 1a4bd02933ea23..40ac6dcac2ab8a 100644 --- a/clang/lib/Headers/ptrauth.h +++ b/clang/lib/Headers/ptrauth.h @@ -32,6 +32,10 @@ typedef enum { The extra data is always 0. */ ptrauth_key_function_pointer = ptrauth_key_process_independent_code, + /* The key used to sign C++ v-table pointers. + The extra data is always 0. */ + ptrauth_key_cxx_vtable_pointer = ptrauth_key_process_independent_data, + /* Other pointers signed under the ABI use private ABI rules. */ } ptrauth_key; @@ -205,6 +209,12 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; #define ptrauth_sign_generic_data(__value, __data) \ __builtin_ptrauth_sign_generic_data(__value, __data) +/* C++ vtable pointer signing class attribute */ +#define ptrauth_cxx_vtable_pointer(key, address_discrimination, \ + extra_discrimination...) 
\ + [[clang::ptrauth_vtable_pointer(key, address_discrimination, \ + extra_discrimination)]] + #else #define ptrauth_strip(__value, __key) \ @@ -271,6 +281,10 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; ((ptrauth_generic_signature_t)0); \ }) + +#define ptrauth_cxx_vtable_pointer(key, address_discrimination, \ + extra_discrimination...) + #endif /* __has_feature(ptrauth_intrinsics) */ #endif /* __PTRAUTH_H */ diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp index 367ae53b208b63..a73ea0b0d124c2 100644 --- a/clang/lib/InstallAPI/Visitor.cpp +++ b/clang/lib/InstallAPI/Visitor.cpp @@ -447,16 +447,16 @@ InstallAPIVisitor::getMangledCXXVTableName(const CXXRecordDecl *D) const { return getBackendMangledName(Name); } -std::string -InstallAPIVisitor::getMangledCXXThunk(const GlobalDecl &D, - const ThunkInfo &Thunk) const { +std::string InstallAPIVisitor::getMangledCXXThunk( + const GlobalDecl &D, const ThunkInfo &Thunk, bool ElideOverrideInfo) const { SmallString<256> Name; raw_svector_ostream NameStream(Name); const auto *Method = cast(D.getDecl()); if (const auto *Dtor = dyn_cast(Method)) - MC->mangleCXXDtorThunk(Dtor, D.getDtorType(), Thunk.This, NameStream); + MC->mangleCXXDtorThunk(Dtor, D.getDtorType(), Thunk, ElideOverrideInfo, + NameStream); else - MC->mangleThunk(Method, Thunk, NameStream); + MC->mangleThunk(Method, Thunk, ElideOverrideInfo, NameStream); return getBackendMangledName(Name); } @@ -500,7 +500,8 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D, return; for (const auto &Thunk : *Thunks) { - const std::string Name = getMangledCXXThunk(GD, Thunk); + const std::string Name = + getMangledCXXThunk(GD, Thunk, /*ElideOverrideInfo=*/true); auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported, GlobalRecord::Kind::Function, Avail, GD.getDecl(), Access); diff --git a/clang/lib/Interpreter/CodeCompletion.cpp b/clang/lib/Interpreter/CodeCompletion.cpp index 25183ae9eeb99e..791426807cb91d 100644 --- a/clang/lib/Interpreter/CodeCompletion.cpp +++ b/clang/lib/Interpreter/CodeCompletion.cpp @@ -368,8 +368,7 @@ void ReplCodeCompleter::codeComplete(CompilerInstance *InterpCI, llvm::SmallVector tb = {}; InterpCI->getFrontendOpts().Inputs[0] = FrontendInputFile( CodeCompletionFileName, Language::CXX, InputKind::Source); - auto Act = std::unique_ptr( - new IncrementalSyntaxOnlyAction(ParentCI)); + auto Act = std::make_unique(ParentCI); std::unique_ptr MB = llvm::MemoryBuffer::getMemBufferCopy(Content, CodeCompletionFileName); llvm::SmallVector RemappedFiles; diff --git a/clang/lib/Interpreter/DeviceOffload.cpp b/clang/lib/Interpreter/DeviceOffload.cpp index fb42964e4936f2..07c9e3005e5fd3 100644 --- a/clang/lib/Interpreter/DeviceOffload.cpp +++ b/clang/lib/Interpreter/DeviceOffload.cpp @@ -17,6 +17,7 @@ #include "clang/Frontend/CompilerInstance.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 3df0391bdda772..9d2720af5dbd92 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1123,8 +1123,9 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, break; // Invalid for floats if (HasSize) break; - if (DoubleUnderscore) - break; // Cannot be repeated. + // There is currently no way to reach this with DoubleUnderscore set. + // If new double underscope literals are added handle it here as above. 
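// Hedged usage sketch for the ptrauth_cxx_vtable_pointer macro introduced in
// the ptrauth.h hunk above; it expands to [[clang::ptrauth_vtable_pointer(...)]]
// when ptrauth_intrinsics is available and to nothing otherwise. The argument
// spellings below (process_independent, address_discrimination,
// type_discrimination) are assumptions for illustration only.
#include <ptrauth.h>
class ptrauth_cxx_vtable_pointer(process_independent, address_discrimination,
                                 type_discrimination) Widget {
public:
  virtual void draw();
};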
+ assert(!DoubleUnderscore && "unhandled double underscore case"); if (LangOpts.CPlusPlus && s + 2 < ThisTokEnd && s[1] == '_') { // s + 2 < ThisTokEnd to ensure some character exists // after __ diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index c5289174373327..a07f7ad2233fe6 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -368,6 +368,27 @@ static bool attributeIsTypeArgAttr(const IdentifierInfo &II) { #undef CLANG_ATTR_TYPE_ARG_LIST } +/// Determine whether the given attribute takes identifier arguments. +static bool attributeHasStrictIdentifierArgs(const IdentifierInfo &II) { +#define CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST + return (llvm::StringSwitch(normalizeAttrName(II.getName())) +#include "clang/Parse/AttrParserStringSwitches.inc" + .Default(0)) != 0; +#undef CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST +} + +/// Determine whether the given attribute takes an identifier argument at a +/// specific index +static bool attributeHasStrictIdentifierArgAtIndex(const IdentifierInfo &II, + size_t argIndex) { +#define CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST + return (llvm::StringSwitch(normalizeAttrName(II.getName())) +#include "clang/Parse/AttrParserStringSwitches.inc" + .Default(0)) & + (1ull << argIndex); +#undef CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST +} + /// Determine whether the given attribute requires parsing its arguments /// in an unevaluated context or not. static bool attributeParsedArgsUnevaluated(const IdentifierInfo &II) { @@ -546,7 +567,8 @@ unsigned Parser::ParseAttributeArgsCommon( } if (T.isUsable()) TheParsedType = T.get(); - } else if (AttributeHasVariadicIdentifierArg) { + } else if (AttributeHasVariadicIdentifierArg || + attributeHasStrictIdentifierArgs(*AttrName)) { // Parse variadic identifier arg. This can either consume identifiers or // expressions. 
Variadic identifier args do not support parameter packs // because those are typically used for attributes with enumeration @@ -557,6 +579,12 @@ unsigned Parser::ParseAttributeArgsCommon( if (ChangeKWThisToIdent && Tok.is(tok::kw_this)) Tok.setKind(tok::identifier); + if (Tok.is(tok::identifier) && attributeHasStrictIdentifierArgAtIndex( + *AttrName, ArgExprs.size())) { + ArgExprs.push_back(ParseIdentifierLoc()); + continue; + } + ExprResult ArgExpr; if (Tok.is(tok::identifier)) { ArgExprs.push_back(ParseIdentifierLoc()); diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 5e3ee5f0579aab..2fc43a7e7926b7 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -2681,6 +2681,10 @@ bool Parser::ParseCXXMemberDeclaratorBeforeInitializer( else DeclaratorInfo.SetIdentifier(nullptr, Tok.getLocation()); + if (getLangOpts().HLSL) + MaybeParseHLSLAnnotations(DeclaratorInfo, nullptr, + /*CouldBeBitField*/ true); + if (!DeclaratorInfo.isFunctionDeclarator() && TryConsumeToken(tok::colon)) { assert(DeclaratorInfo.isPastIdentifier() && "don't know where identifier would go yet?"); diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp index 4b72afe9986e52..b36ea4012c26e1 100644 --- a/clang/lib/Parse/ParseHLSL.cpp +++ b/clang/lib/Parse/ParseHLSL.cpp @@ -119,9 +119,11 @@ static void fixSeparateAttrArgAndNumber(StringRef ArgStr, SourceLocation ArgLoc, } void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs, - SourceLocation *EndLoc) { + SourceLocation *EndLoc, + bool CouldBeBitField) { assert(Tok.is(tok::colon) && "Not a HLSL Annotation"); + Token OldToken = Tok; ConsumeToken(); IdentifierInfo *II = nullptr; @@ -131,6 +133,10 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs, II = Tok.getIdentifierInfo(); if (!II) { + if (CouldBeBitField) { + UnconsumeToken(OldToken); + return; + } Diag(Tok.getLocation(), diag::err_expected_semantic_identifier); return; } diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 50a872fedebf7c..326cd22ff90059 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -24,10 +24,10 @@ #include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaCodeCompletion.h" #include "clang/Sema/SemaOpenMP.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Frontend/OpenMP/OMPAssume.h" #include "llvm/Frontend/OpenMP/OMPContext.h" -#include #include using namespace clang; @@ -2146,7 +2146,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( case OMPD_requires: { SourceLocation StartLoc = ConsumeToken(); SmallVector Clauses; - std::bitset SeenClauses; + llvm::SmallBitVector SeenClauses(llvm::omp::Clause_enumSize + 1); if (Tok.is(tok::annot_pragma_openmp_end)) { Diag(Tok, diag::err_omp_expected_clause) << getOpenMPDirectiveName(OMPD_requires); @@ -2374,81 +2374,19 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( case OMPD_unknown: Diag(Tok, diag::err_omp_unknown_directive); break; - case OMPD_parallel: - case OMPD_simd: - case OMPD_tile: - case OMPD_unroll: - case OMPD_task: - case OMPD_taskyield: - case OMPD_barrier: - case OMPD_taskwait: - case OMPD_taskgroup: - case OMPD_flush: - case OMPD_depobj: - case OMPD_scan: - case OMPD_for: - case OMPD_for_simd: - case OMPD_sections: - case OMPD_section: - case OMPD_single: - case OMPD_master: - case OMPD_ordered: - case OMPD_critical: - case OMPD_parallel_for: - case OMPD_parallel_for_simd: - case 
OMPD_parallel_sections: - case OMPD_parallel_master: - case OMPD_parallel_masked: - case OMPD_atomic: - case OMPD_target: - case OMPD_teams: - case OMPD_cancellation_point: - case OMPD_cancel: - case OMPD_target_data: - case OMPD_target_enter_data: - case OMPD_target_exit_data: - case OMPD_target_parallel: - case OMPD_target_parallel_for: - case OMPD_taskloop: - case OMPD_taskloop_simd: - case OMPD_master_taskloop: - case OMPD_master_taskloop_simd: - case OMPD_parallel_master_taskloop: - case OMPD_parallel_master_taskloop_simd: - case OMPD_masked_taskloop: - case OMPD_masked_taskloop_simd: - case OMPD_parallel_masked_taskloop: - case OMPD_parallel_masked_taskloop_simd: - case OMPD_distribute: - case OMPD_target_update: - case OMPD_distribute_parallel_for: - case OMPD_distribute_parallel_for_simd: - case OMPD_distribute_simd: - case OMPD_target_parallel_for_simd: - case OMPD_target_simd: - case OMPD_scope: - case OMPD_teams_distribute: - case OMPD_teams_distribute_simd: - case OMPD_teams_distribute_parallel_for_simd: - case OMPD_teams_distribute_parallel_for: - case OMPD_target_teams: - case OMPD_target_teams_distribute: - case OMPD_target_teams_distribute_parallel_for: - case OMPD_target_teams_distribute_parallel_for_simd: - case OMPD_target_teams_distribute_simd: - case OMPD_dispatch: - case OMPD_masked: - case OMPD_metadirective: - case OMPD_loop: - case OMPD_teams_loop: - case OMPD_target_teams_loop: - case OMPD_parallel_loop: - case OMPD_target_parallel_loop: - Diag(Tok, diag::err_omp_unexpected_directive) - << 1 << getOpenMPDirectiveName(DKind); - break; default: - break; + switch (getDirectiveCategory(DKind)) { + case Category::Executable: + case Category::Meta: + case Category::Subsidiary: + case Category::Utility: + Diag(Tok, diag::err_omp_unexpected_directive) + << 1 << getOpenMPDirectiveName(DKind); + break; + case Category::Declarative: + case Category::Informational: + break; + } } while (Tok.isNot(tok::annot_pragma_openmp_end)) ConsumeAnyToken(); @@ -2456,6 +2394,184 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( return nullptr; } +StmtResult Parser::ParseOpenMPExecutableDirective( + ParsedStmtContext StmtCtx, OpenMPDirectiveKind DKind, SourceLocation Loc, + bool ReadDirectiveWithinMetadirective) { + assert(isOpenMPExecutableDirective(DKind) && "Unexpected directive category"); + + bool HasAssociatedStatement = true; + Association Assoc = getDirectiveAssociation(DKind); + + // OMPD_ordered has None as association, but it comes in two variants, + // the second of which is associated with a block. + // OMPD_scan and OMPD_section are both "separating", but section is treated + // as if it was associated with a statement, while scan is not. + if (DKind != OMPD_ordered && DKind != OMPD_section && + (Assoc == Association::None || Assoc == Association::Separating)) { + if ((StmtCtx & ParsedStmtContext::AllowStandaloneOpenMPDirectives) == + ParsedStmtContext()) { + Diag(Tok, diag::err_omp_immediate_directive) + << getOpenMPDirectiveName(DKind) << 0; + if (DKind == OMPD_error) { + SkipUntil(tok::annot_pragma_openmp_end); + return StmtError(); + } + } + HasAssociatedStatement = false; + } + + SourceLocation EndLoc; + SmallVector Clauses; + llvm::SmallBitVector SeenClauses(llvm::omp::Clause_enumSize + 1); + DeclarationNameInfo DirName; + OpenMPDirectiveKind CancelRegion = OMPD_unknown; + unsigned ScopeFlags = Scope::FnScope | Scope::DeclScope | + Scope::CompoundStmtScope | Scope::OpenMPDirectiveScope; + + // Special processing for flush and depobj clauses. 
+ Token ImplicitTok; + bool ImplicitClauseAllowed = false; + if (DKind == OMPD_flush || DKind == OMPD_depobj) { + ImplicitTok = Tok; + ImplicitClauseAllowed = true; + } + ConsumeToken(); + // Parse directive name of the 'critical' directive if any. + if (DKind == OMPD_critical) { + BalancedDelimiterTracker T(*this, tok::l_paren, + tok::annot_pragma_openmp_end); + if (!T.consumeOpen()) { + if (Tok.isAnyIdentifier()) { + DirName = + DeclarationNameInfo(Tok.getIdentifierInfo(), Tok.getLocation()); + ConsumeAnyToken(); + } else { + Diag(Tok, diag::err_omp_expected_identifier_for_critical); + } + T.consumeClose(); + } + } else if (DKind == OMPD_cancellation_point || DKind == OMPD_cancel) { + CancelRegion = parseOpenMPDirectiveKind(*this); + if (Tok.isNot(tok::annot_pragma_openmp_end)) + ConsumeToken(); + } + + if (isOpenMPLoopDirective(DKind)) + ScopeFlags |= Scope::OpenMPLoopDirectiveScope; + if (isOpenMPSimdDirective(DKind)) + ScopeFlags |= Scope::OpenMPSimdDirectiveScope; + ParseScope OMPDirectiveScope(this, ScopeFlags); + Actions.OpenMP().StartOpenMPDSABlock(DKind, DirName, Actions.getCurScope(), + Loc); + + while (Tok.isNot(tok::annot_pragma_openmp_end)) { + // If we are parsing for a directive within a metadirective, the directive + // ends with a ')'. + if (ReadDirectiveWithinMetadirective && Tok.is(tok::r_paren)) { + while (Tok.isNot(tok::annot_pragma_openmp_end)) + ConsumeAnyToken(); + break; + } + bool HasImplicitClause = false; + if (ImplicitClauseAllowed && Tok.is(tok::l_paren)) { + HasImplicitClause = true; + // Push copy of the current token back to stream to properly parse + // pseudo-clause OMPFlushClause or OMPDepobjClause. + PP.EnterToken(Tok, /*IsReinject*/ true); + PP.EnterToken(ImplicitTok, /*IsReinject*/ true); + ConsumeAnyToken(); + } + OpenMPClauseKind CKind = Tok.isAnnotation() + ? OMPC_unknown + : getOpenMPClauseKind(PP.getSpelling(Tok)); + if (HasImplicitClause) { + assert(CKind == OMPC_unknown && "Must be unknown implicit clause."); + if (DKind == OMPD_flush) { + CKind = OMPC_flush; + } else { + assert(DKind == OMPD_depobj && "Expected flush or depobj directives."); + CKind = OMPC_depobj; + } + } + // No more implicit clauses allowed. + ImplicitClauseAllowed = false; + Actions.OpenMP().StartOpenMPClause(CKind); + HasImplicitClause = false; + OMPClause *Clause = + ParseOpenMPClause(DKind, CKind, !SeenClauses[unsigned(CKind)]); + SeenClauses[unsigned(CKind)] = true; + if (Clause) + Clauses.push_back(Clause); + + // Skip ',' if any. + if (Tok.is(tok::comma)) + ConsumeToken(); + Actions.OpenMP().EndOpenMPClause(); + } + // End location of the directive. + EndLoc = Tok.getLocation(); + // Consume final annot_pragma_openmp_end. + ConsumeAnnotationToken(); + + if (DKind == OMPD_ordered) { + // If the depend or doacross clause is specified, the ordered construct + // is a stand-alone directive. + for (auto CK : {OMPC_depend, OMPC_doacross}) { + if (SeenClauses[unsigned(CK)]) { + if ((StmtCtx & ParsedStmtContext::AllowStandaloneOpenMPDirectives) == + ParsedStmtContext()) { + Diag(Loc, diag::err_omp_immediate_directive) + << getOpenMPDirectiveName(DKind) << 1 << getOpenMPClauseName(CK); + } + HasAssociatedStatement = false; + } + } + } + + if (DKind == OMPD_tile && !SeenClauses[unsigned(OMPC_sizes)]) { + Diag(Loc, diag::err_omp_required_clause) + << getOpenMPDirectiveName(OMPD_tile) << "sizes"; + } + + StmtResult AssociatedStmt; + if (HasAssociatedStatement) { + // The body is a block scope like in Lambdas and Blocks. 
+ Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); + // FIXME: We create a bogus CompoundStmt scope to hold the contents of + // the captured region. Code elsewhere assumes that any FunctionScopeInfo + // should have at least one compound statement scope within it. + ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false); + { + Sema::CompoundScopeRAII Scope(Actions); + AssociatedStmt = ParseStatement(); + + if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) && + getLangOpts().OpenMPIRBuilder) + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get()); + } + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); + } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data || + DKind == OMPD_target_exit_data) { + Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); + AssociatedStmt = (Sema::CompoundScopeRAII(Actions), + Actions.ActOnCompoundStmt(Loc, Loc, std::nullopt, + /*isStmtExpr=*/false)); + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); + } + + StmtResult Directive = Actions.OpenMP().ActOnOpenMPExecutableDirective( + DKind, DirName, CancelRegion, Clauses, AssociatedStmt.get(), Loc, EndLoc); + + // Exit scope. + Actions.OpenMP().EndOpenMPDSABlock(Directive.get()); + OMPDirectiveScope.Exit(); + + return Directive; +} + /// Parsing of declarative or executable OpenMP directives. /// /// threadprivate-directive: @@ -2503,24 +2619,30 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( "Not an OpenMP directive!"); ParsingOpenMPDirectiveRAII DirScope(*this); ParenBraceBracketBalancer BalancerRAIIObj(*this); - SmallVector Clauses; - std::bitset SeenClauses; - unsigned ScopeFlags = Scope::FnScope | Scope::DeclScope | - Scope::CompoundStmtScope | Scope::OpenMPDirectiveScope; SourceLocation Loc = ReadDirectiveWithinMetadirective ? Tok.getLocation() - : ConsumeAnnotationToken(), - EndLoc; + : ConsumeAnnotationToken(); OpenMPDirectiveKind DKind = parseOpenMPDirectiveKind(*this); if (ReadDirectiveWithinMetadirective && DKind == OMPD_unknown) { Diag(Tok, diag::err_omp_unknown_directive); return StmtError(); } - OpenMPDirectiveKind CancelRegion = OMPD_unknown; - // Name of critical directive. - DeclarationNameInfo DirName; + StmtResult Directive = StmtError(); - bool HasAssociatedStatement = true; + + bool IsExecutable = [&]() { + if (DKind == OMPD_error) // OMPD_error is handled as executable + return true; + auto Res = getDirectiveCategory(DKind); + return Res == Category::Executable || Res == Category::Subsidiary; + }(); + + if (IsExecutable) { + Directive = ParseOpenMPExecutableDirective( + StmtCtx, DKind, Loc, ReadDirectiveWithinMetadirective); + assert(!Directive.isUnset() && "Executable directive remained unprocessed"); + return Directive; + } switch (DKind) { case OMPD_nothing: @@ -2709,7 +2831,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( /*AllowScopeSpecifier=*/false)) { SmallVector Clauses; if (Tok.isNot(tok::annot_pragma_openmp_end)) { - std::bitset SeenClauses; + llvm::SmallBitVector SeenClauses(llvm::omp::Clause_enumSize + 1); while (Tok.isNot(tok::annot_pragma_openmp_end)) { OpenMPClauseKind CKind = Tok.isAnnotation() ? 
OMPC_unknown @@ -2763,233 +2885,6 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( } break; } - case OMPD_flush: - case OMPD_depobj: - case OMPD_scan: - case OMPD_taskyield: - case OMPD_error: - case OMPD_barrier: - case OMPD_taskwait: - case OMPD_cancellation_point: - case OMPD_cancel: - case OMPD_target_enter_data: - case OMPD_target_exit_data: - case OMPD_target_update: - case OMPD_interop: - if ((StmtCtx & ParsedStmtContext::AllowStandaloneOpenMPDirectives) == - ParsedStmtContext()) { - Diag(Tok, diag::err_omp_immediate_directive) - << getOpenMPDirectiveName(DKind) << 0; - if (DKind == OMPD_error) { - SkipUntil(tok::annot_pragma_openmp_end); - break; - } - } - HasAssociatedStatement = false; - // Fall through for further analysis. - [[fallthrough]]; - case OMPD_parallel: - case OMPD_simd: - case OMPD_tile: - case OMPD_unroll: - case OMPD_for: - case OMPD_for_simd: - case OMPD_sections: - case OMPD_single: - case OMPD_section: - case OMPD_master: - case OMPD_critical: - case OMPD_parallel_for: - case OMPD_parallel_for_simd: - case OMPD_parallel_sections: - case OMPD_parallel_master: - case OMPD_parallel_masked: - case OMPD_task: - case OMPD_ordered: - case OMPD_atomic: - case OMPD_target: - case OMPD_teams: - case OMPD_taskgroup: - case OMPD_target_data: - case OMPD_target_parallel: - case OMPD_target_parallel_for: - case OMPD_loop: - case OMPD_teams_loop: - case OMPD_target_teams_loop: - case OMPD_parallel_loop: - case OMPD_target_parallel_loop: - case OMPD_scope: - case OMPD_taskloop: - case OMPD_taskloop_simd: - case OMPD_master_taskloop: - case OMPD_masked_taskloop: - case OMPD_master_taskloop_simd: - case OMPD_masked_taskloop_simd: - case OMPD_parallel_master_taskloop: - case OMPD_parallel_masked_taskloop: - case OMPD_parallel_master_taskloop_simd: - case OMPD_parallel_masked_taskloop_simd: - case OMPD_distribute: - case OMPD_distribute_parallel_for: - case OMPD_distribute_parallel_for_simd: - case OMPD_distribute_simd: - case OMPD_target_parallel_for_simd: - case OMPD_target_simd: - case OMPD_teams_distribute: - case OMPD_teams_distribute_simd: - case OMPD_teams_distribute_parallel_for_simd: - case OMPD_teams_distribute_parallel_for: - case OMPD_target_teams: - case OMPD_target_teams_distribute: - case OMPD_target_teams_distribute_parallel_for: - case OMPD_target_teams_distribute_parallel_for_simd: - case OMPD_target_teams_distribute_simd: - case OMPD_dispatch: - case OMPD_masked: { - // Special processing for flush and depobj clauses. - Token ImplicitTok; - bool ImplicitClauseAllowed = false; - if (DKind == OMPD_flush || DKind == OMPD_depobj) { - ImplicitTok = Tok; - ImplicitClauseAllowed = true; - } - ConsumeToken(); - // Parse directive name of the 'critical' directive if any. 
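// The parser now classifies unexpected directives by category instead of
// enumerating every directive kind. A minimal standalone sketch of that
// classification, assuming the llvm::omp interfaces used above
// (getDirectiveCategory and Category from llvm/Frontend/OpenMP/OMP.h):
#include "llvm/Frontend/OpenMP/OMP.h"
static bool isHandledAsExecutableDirective(llvm::omp::Directive D) {
  using llvm::omp::Category;
  if (D == llvm::omp::OMPD_error) // treated as executable by the parser above
    return true;
  Category C = llvm::omp::getDirectiveCategory(D);
  return C == Category::Executable || C == Category::Subsidiary;
}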
- if (DKind == OMPD_critical) { - BalancedDelimiterTracker T(*this, tok::l_paren, - tok::annot_pragma_openmp_end); - if (!T.consumeOpen()) { - if (Tok.isAnyIdentifier()) { - DirName = - DeclarationNameInfo(Tok.getIdentifierInfo(), Tok.getLocation()); - ConsumeAnyToken(); - } else { - Diag(Tok, diag::err_omp_expected_identifier_for_critical); - } - T.consumeClose(); - } - } else if (DKind == OMPD_cancellation_point || DKind == OMPD_cancel) { - CancelRegion = parseOpenMPDirectiveKind(*this); - if (Tok.isNot(tok::annot_pragma_openmp_end)) - ConsumeToken(); - } - - if (isOpenMPLoopDirective(DKind)) - ScopeFlags |= Scope::OpenMPLoopDirectiveScope; - if (isOpenMPSimdDirective(DKind)) - ScopeFlags |= Scope::OpenMPSimdDirectiveScope; - ParseScope OMPDirectiveScope(this, ScopeFlags); - Actions.OpenMP().StartOpenMPDSABlock(DKind, DirName, Actions.getCurScope(), - Loc); - - while (Tok.isNot(tok::annot_pragma_openmp_end)) { - // If we are parsing for a directive within a metadirective, the directive - // ends with a ')'. - if (ReadDirectiveWithinMetadirective && Tok.is(tok::r_paren)) { - while (Tok.isNot(tok::annot_pragma_openmp_end)) - ConsumeAnyToken(); - break; - } - bool HasImplicitClause = false; - if (ImplicitClauseAllowed && Tok.is(tok::l_paren)) { - HasImplicitClause = true; - // Push copy of the current token back to stream to properly parse - // pseudo-clause OMPFlushClause or OMPDepobjClause. - PP.EnterToken(Tok, /*IsReinject*/ true); - PP.EnterToken(ImplicitTok, /*IsReinject*/ true); - ConsumeAnyToken(); - } - OpenMPClauseKind CKind = Tok.isAnnotation() - ? OMPC_unknown - : getOpenMPClauseKind(PP.getSpelling(Tok)); - if (HasImplicitClause) { - assert(CKind == OMPC_unknown && "Must be unknown implicit clause."); - if (DKind == OMPD_flush) { - CKind = OMPC_flush; - } else { - assert(DKind == OMPD_depobj && - "Expected flush or depobj directives."); - CKind = OMPC_depobj; - } - } - // No more implicit clauses allowed. - ImplicitClauseAllowed = false; - Actions.OpenMP().StartOpenMPClause(CKind); - HasImplicitClause = false; - OMPClause *Clause = - ParseOpenMPClause(DKind, CKind, !SeenClauses[unsigned(CKind)]); - SeenClauses[unsigned(CKind)] = true; - if (Clause) - Clauses.push_back(Clause); - - // Skip ',' if any. - if (Tok.is(tok::comma)) - ConsumeToken(); - Actions.OpenMP().EndOpenMPClause(); - } - // End location of the directive. - EndLoc = Tok.getLocation(); - // Consume final annot_pragma_openmp_end. - ConsumeAnnotationToken(); - - if (DKind == OMPD_ordered) { - // If the depend or doacross clause is specified, the ordered construct - // is a stand-alone directive. - for (auto CK : {OMPC_depend, OMPC_doacross}) { - if (SeenClauses[unsigned(CK)]) { - if ((StmtCtx & ParsedStmtContext::AllowStandaloneOpenMPDirectives) == - ParsedStmtContext()) { - Diag(Loc, diag::err_omp_immediate_directive) - << getOpenMPDirectiveName(DKind) << 1 - << getOpenMPClauseName(CK); - } - HasAssociatedStatement = false; - } - } - } - - if (DKind == OMPD_tile && !SeenClauses[unsigned(OMPC_sizes)]) { - Diag(Loc, diag::err_omp_required_clause) - << getOpenMPDirectiveName(OMPD_tile) << "sizes"; - } - - StmtResult AssociatedStmt; - if (HasAssociatedStatement) { - // The body is a block scope like in Lambdas and Blocks. - Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); - // FIXME: We create a bogus CompoundStmt scope to hold the contents of - // the captured region. Code elsewhere assumes that any FunctionScopeInfo - // should have at least one compound statement scope within it. 
- ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false); - { - Sema::CompoundScopeRAII Scope(Actions); - AssociatedStmt = ParseStatement(); - - if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) && - getLangOpts().OpenMPIRBuilder) - AssociatedStmt = - Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get()); - } - AssociatedStmt = - Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); - } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data || - DKind == OMPD_target_exit_data) { - Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); - AssociatedStmt = (Sema::CompoundScopeRAII(Actions), - Actions.ActOnCompoundStmt(Loc, Loc, std::nullopt, - /*isStmtExpr=*/false)); - AssociatedStmt = - Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); - } - Directive = Actions.OpenMP().ActOnOpenMPExecutableDirective( - DKind, DirName, CancelRegion, Clauses, AssociatedStmt.get(), Loc, - EndLoc); - - // Exit scope. - Actions.OpenMP().EndOpenMPDSABlock(Directive.get()); - OMPDirectiveScope.Exit(); - break; - } case OMPD_declare_target: { SourceLocation DTLoc = ConsumeAnyToken(); bool HasClauses = Tok.isNot(tok::annot_pragma_openmp_end); @@ -4785,8 +4680,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, getLangOpts()); Data.ExtraModifierLoc = Tok.getLocation(); if (Data.ExtraModifier == OMPC_ADJUST_ARGS_unknown) { - SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end, - StopBeforeMatch); + Diag(Tok, diag::err_omp_unknown_adjust_args_op); + SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); } else { ConsumeToken(); if (Tok.is(tok::colon)) @@ -4799,7 +4694,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, bool IsComma = (Kind != OMPC_reduction && Kind != OMPC_task_reduction && Kind != OMPC_in_reduction && Kind != OMPC_depend && - Kind != OMPC_doacross && Kind != OMPC_map) || + Kind != OMPC_doacross && Kind != OMPC_map && Kind != OMPC_adjust_args) || (Kind == OMPC_reduction && !InvalidReductionId) || (Kind == OMPC_map && Data.ExtraModifier != OMPC_MAP_unknown) || (Kind == OMPC_depend && Data.ExtraModifier != OMPC_DEPEND_unknown) || diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 6d0cf7b174e50e..71b87147e9a5f1 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2060,9 +2060,19 @@ bool Parser::TryAnnotateTypeOrScopeToken( return true; } + bool TemplateKWPresent = false; + if (Tok.is(tok::kw_template)) { + ConsumeToken(); + TemplateKWPresent = true; + } + TypeResult Ty; if (Tok.is(tok::identifier)) { - // FIXME: check whether the next token is '<', first! 
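// Illustrative only: with the Parser.cpp change above, a 'template' keyword in
// a typename-specifier must be followed by a template argument list, otherwise
// missing_template_arg_list_after_template_kw is emitted. A hypothetical
// snippet showing both forms (not taken from the test suite):
template <class T> struct User {
  typename T::template Nested<int> OK; // accepted: argument list present
  // typename T::template Nested Bad;  // now rejected: no '<...>' after 'template'
};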
+ if (TemplateKWPresent && NextToken().isNot(tok::less)) { + Diag(Tok.getLocation(), + diag::missing_template_arg_list_after_template_kw); + return true; + } Ty = Actions.ActOnTypenameType(getCurScope(), TypenameLoc, SS, *Tok.getIdentifierInfo(), Tok.getLocation()); diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index f152d243d39a5b..980a83d4431aa2 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -15,6 +15,7 @@ clang_tablegen(OpenCLBuiltins.inc -gen-clang-opencl-builtins add_clang_library(clangSema AnalysisBasedWarnings.cpp + CheckExprLifetime.cpp CodeCompleteConsumer.cpp DeclSpec.cpp DelayedDiagnostic.cpp diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp new file mode 100644 index 00000000000000..54e2f1c22536dd --- /dev/null +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -0,0 +1,1259 @@ +//===--- CheckExprLifetime.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CheckExprLifetime.h" +#include "clang/AST/Expr.h" +#include "clang/Sema/Sema.h" +#include "llvm/ADT/PointerIntPair.h" + +namespace clang::sema { +namespace { +enum LifetimeKind { + /// The lifetime of a temporary bound to this entity ends at the end of the + /// full-expression, and that's (probably) fine. + LK_FullExpression, + + /// The lifetime of a temporary bound to this entity is extended to the + /// lifeitme of the entity itself. + LK_Extended, + + /// The lifetime of a temporary bound to this entity probably ends too soon, + /// because the entity is allocated in a new-expression. + LK_New, + + /// The lifetime of a temporary bound to this entity ends too soon, because + /// the entity is a return object. + LK_Return, + + /// The lifetime of a temporary bound to this entity ends too soon, because + /// the entity is the result of a statement expression. + LK_StmtExprResult, + + /// This is a mem-initializer: if it would extend a temporary (other than via + /// a default member initializer), the program is ill-formed. + LK_MemInitializer, +}; +using LifetimeResult = + llvm::PointerIntPair; +} // namespace + +/// Determine the declaration which an initialized entity ultimately refers to, +/// for the purpose of lifetime-extending a temporary bound to a reference in +/// the initialization of \p Entity. +static LifetimeResult +getEntityLifetime(const InitializedEntity *Entity, + const InitializedEntity *InitField = nullptr) { + // C++11 [class.temporary]p5: + switch (Entity->getKind()) { + case InitializedEntity::EK_Variable: + // The temporary [...] persists for the lifetime of the reference + return {Entity, LK_Extended}; + + case InitializedEntity::EK_Member: + // For subobjects, we look at the complete object. + if (Entity->getParent()) + return getEntityLifetime(Entity->getParent(), Entity); + + // except: + // C++17 [class.base.init]p8: + // A temporary expression bound to a reference member in a + // mem-initializer is ill-formed. + // C++17 [class.base.init]p11: + // A temporary expression bound to a reference member from a + // default member initializer is ill-formed. 
+ // + // The context of p11 and its example suggest that it's only the use of a + // default member initializer from a constructor that makes the program + // ill-formed, not its mere existence, and that it can even be used by + // aggregate initialization. + return {Entity, Entity->isDefaultMemberInitializer() ? LK_Extended + : LK_MemInitializer}; + + case InitializedEntity::EK_Binding: + // Per [dcl.decomp]p3, the binding is treated as a variable of reference + // type. + return {Entity, LK_Extended}; + + case InitializedEntity::EK_Parameter: + case InitializedEntity::EK_Parameter_CF_Audited: + // -- A temporary bound to a reference parameter in a function call + // persists until the completion of the full-expression containing + // the call. + return {nullptr, LK_FullExpression}; + + case InitializedEntity::EK_TemplateParameter: + // FIXME: This will always be ill-formed; should we eagerly diagnose it + // here? + return {nullptr, LK_FullExpression}; + + case InitializedEntity::EK_Result: + // -- The lifetime of a temporary bound to the returned value in a + // function return statement is not extended; the temporary is + // destroyed at the end of the full-expression in the return statement. + return {nullptr, LK_Return}; + + case InitializedEntity::EK_StmtExprResult: + // FIXME: Should we lifetime-extend through the result of a statement + // expression? + return {nullptr, LK_StmtExprResult}; + + case InitializedEntity::EK_New: + // -- A temporary bound to a reference in a new-initializer persists + // until the completion of the full-expression containing the + // new-initializer. + return {nullptr, LK_New}; + + case InitializedEntity::EK_Temporary: + case InitializedEntity::EK_CompoundLiteralInit: + case InitializedEntity::EK_RelatedResult: + // We don't yet know the storage duration of the surrounding temporary. + // Assume it's got full-expression duration for now, it will patch up our + // storage duration if that's not correct. + return {nullptr, LK_FullExpression}; + + case InitializedEntity::EK_ArrayElement: + // For subobjects, we look at the complete object. + return getEntityLifetime(Entity->getParent(), InitField); + + case InitializedEntity::EK_Base: + // For subobjects, we look at the complete object. + if (Entity->getParent()) + return getEntityLifetime(Entity->getParent(), InitField); + return {InitField, LK_MemInitializer}; + + case InitializedEntity::EK_Delegating: + // We can reach this case for aggregate initialization in a constructor: + // struct A { int &&r; }; + // struct B : A { B() : A{0} {} }; + // In this case, use the outermost field decl as the context. + return {InitField, LK_MemInitializer}; + + case InitializedEntity::EK_BlockElement: + case InitializedEntity::EK_LambdaToBlockConversionBlockElement: + case InitializedEntity::EK_LambdaCapture: + case InitializedEntity::EK_VectorElement: + case InitializedEntity::EK_ComplexElement: + return {nullptr, LK_FullExpression}; + + case InitializedEntity::EK_Exception: + // FIXME: Can we diagnose lifetime problems with exceptions? + return {nullptr, LK_FullExpression}; + + case InitializedEntity::EK_ParenAggInitMember: + // -- A temporary object bound to a reference element of an aggregate of + // class type initialized from a parenthesized expression-list + // [dcl.init, 9.3] persists until the completion of the full-expression + // containing the expression-list. 
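// For orientation, the kind of code the LK_MemInitializer classification above
// concerns (C++17 [class.base.init]p8); hypothetical example, not from this
// file:
struct HasRefMember {
  const int &R;
  HasRefMember() : R(42) {} // ill-formed: temporary bound to a reference
                            // member in a mem-initializer
};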
+ return {nullptr, LK_FullExpression}; + } + + llvm_unreachable("unknown entity kind"); +} + +namespace { +enum ReferenceKind { + /// Lifetime would be extended by a reference binding to a temporary. + RK_ReferenceBinding, + /// Lifetime would be extended by a std::initializer_list object binding to + /// its backing array. + RK_StdInitializerList, +}; + +/// A temporary or local variable. This will be one of: +/// * A MaterializeTemporaryExpr. +/// * A DeclRefExpr whose declaration is a local. +/// * An AddrLabelExpr. +/// * A BlockExpr for a block with captures. +using Local = Expr *; + +/// Expressions we stepped over when looking for the local state. Any steps +/// that would inhibit lifetime extension or take us out of subexpressions of +/// the initializer are included. +struct IndirectLocalPathEntry { + enum EntryKind { + DefaultInit, + AddressOf, + VarInit, + LValToRVal, + LifetimeBoundCall, + TemporaryCopy, + LambdaCaptureInit, + GslReferenceInit, + GslPointerInit + } Kind; + Expr *E; + union { + const Decl *D = nullptr; + const LambdaCapture *Capture; + }; + IndirectLocalPathEntry() {} + IndirectLocalPathEntry(EntryKind K, Expr *E) : Kind(K), E(E) {} + IndirectLocalPathEntry(EntryKind K, Expr *E, const Decl *D) + : Kind(K), E(E), D(D) {} + IndirectLocalPathEntry(EntryKind K, Expr *E, const LambdaCapture *Capture) + : Kind(K), E(E), Capture(Capture) {} +}; + +using IndirectLocalPath = llvm::SmallVectorImpl; + +struct RevertToOldSizeRAII { + IndirectLocalPath &Path; + unsigned OldSize = Path.size(); + RevertToOldSizeRAII(IndirectLocalPath &Path) : Path(Path) {} + ~RevertToOldSizeRAII() { Path.resize(OldSize); } +}; + +using LocalVisitor = llvm::function_ref; +} // namespace + +static bool isVarOnPath(IndirectLocalPath &Path, VarDecl *VD) { + for (auto E : Path) + if (E.Kind == IndirectLocalPathEntry::VarInit && E.D == VD) + return true; + return false; +} + +static bool pathContainsInit(IndirectLocalPath &Path) { + return llvm::any_of(Path, [=](IndirectLocalPathEntry E) { + return E.Kind == IndirectLocalPathEntry::DefaultInit || + E.Kind == IndirectLocalPathEntry::VarInit; + }); +} + +static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, + Expr *Init, LocalVisitor Visit, + bool RevisitSubinits, + bool EnableLifetimeWarnings); + +static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, + Expr *Init, ReferenceKind RK, + LocalVisitor Visit, + bool EnableLifetimeWarnings); + +template static bool isRecordWithAttr(QualType Type) { + if (auto *RD = Type->getAsCXXRecordDecl()) + return RD->hasAttr(); + return false; +} + +// Decl::isInStdNamespace will return false for iterators in some STL +// implementations due to them being defined in a namespace outside of the std +// namespace. 
+static bool isInStlNamespace(const Decl *D) { + const DeclContext *DC = D->getDeclContext(); + if (!DC) + return false; + if (const auto *ND = dyn_cast(DC)) + if (const IdentifierInfo *II = ND->getIdentifier()) { + StringRef Name = II->getName(); + if (Name.size() >= 2 && Name.front() == '_' && + (Name[1] == '_' || isUppercase(Name[1]))) + return true; + } + + return DC->isStdNamespace(); +} + +static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { + if (auto *Conv = dyn_cast_or_null(Callee)) + if (isRecordWithAttr(Conv->getConversionType())) + return true; + if (!isInStlNamespace(Callee->getParent())) + return false; + if (!isRecordWithAttr( + Callee->getFunctionObjectParameterType()) && + !isRecordWithAttr(Callee->getFunctionObjectParameterType())) + return false; + if (Callee->getReturnType()->isPointerType() || + isRecordWithAttr(Callee->getReturnType())) { + if (!Callee->getIdentifier()) + return false; + return llvm::StringSwitch(Callee->getName()) + .Cases("begin", "rbegin", "cbegin", "crbegin", true) + .Cases("end", "rend", "cend", "crend", true) + .Cases("c_str", "data", "get", true) + // Map and set types. + .Cases("find", "equal_range", "lower_bound", "upper_bound", true) + .Default(false); + } else if (Callee->getReturnType()->isReferenceType()) { + if (!Callee->getIdentifier()) { + auto OO = Callee->getOverloadedOperator(); + return OO == OverloadedOperatorKind::OO_Subscript || + OO == OverloadedOperatorKind::OO_Star; + } + return llvm::StringSwitch(Callee->getName()) + .Cases("front", "back", "at", "top", "value", true) + .Default(false); + } + return false; +} + +static bool shouldTrackFirstArgument(const FunctionDecl *FD) { + if (!FD->getIdentifier() || FD->getNumParams() != 1) + return false; + const auto *RD = FD->getParamDecl(0)->getType()->getPointeeCXXRecordDecl(); + if (!FD->isInStdNamespace() || !RD || !RD->isInStdNamespace()) + return false; + if (!isRecordWithAttr(QualType(RD->getTypeForDecl(), 0)) && + !isRecordWithAttr(QualType(RD->getTypeForDecl(), 0))) + return false; + if (FD->getReturnType()->isPointerType() || + isRecordWithAttr(FD->getReturnType())) { + return llvm::StringSwitch(FD->getName()) + .Cases("begin", "rbegin", "cbegin", "crbegin", true) + .Cases("end", "rend", "cend", "crend", true) + .Case("data", true) + .Default(false); + } else if (FD->getReturnType()->isReferenceType()) { + return llvm::StringSwitch(FD->getName()) + .Cases("get", "any_cast", true) + .Default(false); + } + return false; +} + +static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, + LocalVisitor Visit) { + auto VisitPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { + // We are not interested in the temporary base objects of gsl Pointers: + // Temp().ptr; // Here ptr might not dangle. + if (isa(Arg->IgnoreImpCasts())) + return; + // Once we initialized a value with a reference, it can no longer dangle. + if (!Value) { + for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { + if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) + continue; + if (PE.Kind == IndirectLocalPathEntry::GslPointerInit) + return; + break; + } + } + Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit + : IndirectLocalPathEntry::GslReferenceInit, + Arg, D}); + if (Arg->isGLValue()) + visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, + Visit, + /*EnableLifetimeWarnings=*/true); + else + visitLocalsRetainedByInitializer(Path, Arg, Visit, true, + /*EnableLifetimeWarnings=*/true); + Path.pop_back(); + }; + + if (auto *MCE = dyn_cast(Call)) { + const auto *MD = cast_or_null(MCE->getDirectCallee()); + if (MD && shouldTrackImplicitObjectArg(MD)) + VisitPointerArg(MD, MCE->getImplicitObjectArgument(), + !MD->getReturnType()->isReferenceType()); + return; + } else if (auto *OCE = dyn_cast(Call)) { + FunctionDecl *Callee = OCE->getDirectCallee(); + if (Callee && Callee->isCXXInstanceMember() && + shouldTrackImplicitObjectArg(cast(Callee))) + VisitPointerArg(Callee, OCE->getArg(0), + !Callee->getReturnType()->isReferenceType()); + return; + } else if (auto *CE = dyn_cast(Call)) { + FunctionDecl *Callee = CE->getDirectCallee(); + if (Callee && shouldTrackFirstArgument(Callee)) + VisitPointerArg(Callee, CE->getArg(0), + !Callee->getReturnType()->isReferenceType()); + return; + } + + if (auto *CCE = dyn_cast(Call)) { + const auto *Ctor = CCE->getConstructor(); + const CXXRecordDecl *RD = Ctor->getParent(); + if (CCE->getNumArgs() > 0 && RD->hasAttr()) + VisitPointerArg(Ctor->getParamDecl(0), CCE->getArgs()[0], true); + } +} + +static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { + const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); + if (!TSI) + return false; + // Don't declare this variable in the second operand of the for-statement; + // GCC miscompiles that by ending its lifetime before evaluating the + // third operand. See gcc.gnu.org/PR86769. + AttributedTypeLoc ATL; + for (TypeLoc TL = TSI->getTypeLoc(); + (ATL = TL.getAsAdjusted()); + TL = ATL.getModifiedLoc()) { + if (ATL.getAttrAs()) + return true; + } + + // Assume that all assignment operators with a "normal" return type return + // *this, that is, an lvalue reference that is the same type as the implicit + // object parameter (or the LHS for a non-member operator$=). 
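// The helper above scans the function type for [[clang::lifetimebound]] on the
// implicit object parameter. A hedged sketch of the annotation it detects
// (placement follows the Clang attribute documentation; the type and call
// below are illustrative assumptions):
struct IntBox {
  int Value;
  const int &get() const [[clang::lifetimebound]] { return Value; }
};
// const int &R = IntBox{42}.get(); // R refers into the destroyed temporary,
//                                  // which the lifetimebound path flags.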
+ OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); + if (OO == OO_Equal || isCompoundAssignmentOperator(OO)) { + QualType RetT = FD->getReturnType(); + if (RetT->isLValueReferenceType()) { + ASTContext &Ctx = FD->getASTContext(); + QualType LHST; + auto *MD = dyn_cast(FD); + if (MD && MD->isCXXInstanceMember()) + LHST = Ctx.getLValueReferenceType(MD->getFunctionObjectParameterType()); + else + LHST = MD->getParamDecl(0)->getType(); + if (Ctx.hasSameType(RetT, LHST)) + return true; + } + } + + return false; +} + +static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, + LocalVisitor Visit) { + const FunctionDecl *Callee; + ArrayRef Args; + + if (auto *CE = dyn_cast(Call)) { + Callee = CE->getDirectCallee(); + Args = llvm::ArrayRef(CE->getArgs(), CE->getNumArgs()); + } else { + auto *CCE = cast(Call); + Callee = CCE->getConstructor(); + Args = llvm::ArrayRef(CCE->getArgs(), CCE->getNumArgs()); + } + if (!Callee) + return; + + Expr *ObjectArg = nullptr; + if (isa(Call) && Callee->isCXXInstanceMember()) { + ObjectArg = Args[0]; + Args = Args.slice(1); + } else if (auto *MCE = dyn_cast(Call)) { + ObjectArg = MCE->getImplicitObjectArgument(); + } + + auto VisitLifetimeBoundArg = [&](const Decl *D, Expr *Arg) { + Path.push_back({IndirectLocalPathEntry::LifetimeBoundCall, Arg, D}); + if (Arg->isGLValue()) + visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, + Visit, + /*EnableLifetimeWarnings=*/false); + else + visitLocalsRetainedByInitializer(Path, Arg, Visit, true, + /*EnableLifetimeWarnings=*/false); + Path.pop_back(); + }; + + bool CheckCoroCall = false; + if (const auto *RD = Callee->getReturnType()->getAsRecordDecl()) { + CheckCoroCall = RD->hasAttr() && + RD->hasAttr() && + !Callee->hasAttr(); + } + + if (ObjectArg) { + bool CheckCoroObjArg = CheckCoroCall; + // Coroutine lambda objects with empty capture list are not lifetimebound. + if (auto *LE = dyn_cast(ObjectArg->IgnoreImplicit()); + LE && LE->captures().empty()) + CheckCoroObjArg = false; + // Allow `get_return_object()` as the object param (__promise) is not + // lifetimebound. + if (Sema::CanBeGetReturnObject(Callee)) + CheckCoroObjArg = false; + if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) + VisitLifetimeBoundArg(Callee, ObjectArg); + } + + for (unsigned I = 0, + N = std::min(Callee->getNumParams(), Args.size()); + I != N; ++I) { + if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) + VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); + } +} + +/// Visit the locals that would be reachable through a reference bound to the +/// glvalue expression \c Init. +static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, + Expr *Init, ReferenceKind RK, + LocalVisitor Visit, + bool EnableLifetimeWarnings) { + RevertToOldSizeRAII RAII(Path); + + // Walk past any constructs which we can lifetime-extend across. + Expr *Old; + do { + Old = Init; + + if (auto *FE = dyn_cast(Init)) + Init = FE->getSubExpr(); + + if (InitListExpr *ILE = dyn_cast(Init)) { + // If this is just redundant braces around an initializer, step over it. + if (ILE->isTransparent()) + Init = ILE->getInit(0); + } + + // Step over any subobject adjustments; we may have a materialized + // temporary inside them. + Init = const_cast(Init->skipRValueSubobjectAdjustments()); + + // Per current approach for DR1376, look through casts to reference type + // when performing lifetime extension. 
+ if (CastExpr *CE = dyn_cast(Init)) + if (CE->getSubExpr()->isGLValue()) + Init = CE->getSubExpr(); + + // Per the current approach for DR1299, look through array element access + // on array glvalues when performing lifetime extension. + if (auto *ASE = dyn_cast(Init)) { + Init = ASE->getBase(); + auto *ICE = dyn_cast(Init); + if (ICE && ICE->getCastKind() == CK_ArrayToPointerDecay) + Init = ICE->getSubExpr(); + else + // We can't lifetime extend through this but we might still find some + // retained temporaries. + return visitLocalsRetainedByInitializer(Path, Init, Visit, true, + EnableLifetimeWarnings); + } + + // Step into CXXDefaultInitExprs so we can diagnose cases where a + // constructor inherits one as an implicit mem-initializer. + if (auto *DIE = dyn_cast(Init)) { + Path.push_back( + {IndirectLocalPathEntry::DefaultInit, DIE, DIE->getField()}); + Init = DIE->getExpr(); + } + } while (Init != Old); + + if (auto *MTE = dyn_cast(Init)) { + if (Visit(Path, Local(MTE), RK)) + visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true, + EnableLifetimeWarnings); + } + + if (isa(Init)) { + if (EnableLifetimeWarnings) + handleGslAnnotatedTypes(Path, Init, Visit); + return visitLifetimeBoundArguments(Path, Init, Visit); + } + + switch (Init->getStmtClass()) { + case Stmt::DeclRefExprClass: { + // If we find the name of a local non-reference parameter, we could have a + // lifetime problem. + auto *DRE = cast(Init); + auto *VD = dyn_cast(DRE->getDecl()); + if (VD && VD->hasLocalStorage() && + !DRE->refersToEnclosingVariableOrCapture()) { + if (!VD->getType()->isReferenceType()) { + Visit(Path, Local(DRE), RK); + } else if (isa(DRE->getDecl())) { + // The lifetime of a reference parameter is unknown; assume it's OK + // for now. + break; + } else if (VD->getInit() && !isVarOnPath(Path, VD)) { + Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); + visitLocalsRetainedByReferenceBinding(Path, VD->getInit(), + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); + } + } + break; + } + + case Stmt::UnaryOperatorClass: { + // The only unary operator that make sense to handle here + // is Deref. All others don't resolve to a "name." This includes + // handling all sorts of rvalues passed to a unary operator. + const UnaryOperator *U = cast(Init); + if (U->getOpcode() == UO_Deref) + visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true, + EnableLifetimeWarnings); + break; + } + + case Stmt::ArraySectionExprClass: { + visitLocalsRetainedByInitializer(Path, + cast(Init)->getBase(), + Visit, true, EnableLifetimeWarnings); + break; + } + + case Stmt::ConditionalOperatorClass: + case Stmt::BinaryConditionalOperatorClass: { + auto *C = cast(Init); + if (!C->getTrueExpr()->getType()->isVoidType()) + visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit, + EnableLifetimeWarnings); + if (!C->getFalseExpr()->getType()->isVoidType()) + visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit, + EnableLifetimeWarnings); + break; + } + + case Stmt::CompoundLiteralExprClass: { + if (auto *CLE = dyn_cast(Init)) { + if (!CLE->isFileScope()) + Visit(Path, Local(CLE), RK); + } + break; + } + + // FIXME: Visit the left-hand side of an -> or ->*. + + default: + break; + } +} + +/// Visit the locals that would be reachable through an object initialized by +/// the prvalue expression \c Init. 
+static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, + Expr *Init, LocalVisitor Visit, + bool RevisitSubinits, + bool EnableLifetimeWarnings) { + RevertToOldSizeRAII RAII(Path); + + Expr *Old; + do { + Old = Init; + + // Step into CXXDefaultInitExprs so we can diagnose cases where a + // constructor inherits one as an implicit mem-initializer. + if (auto *DIE = dyn_cast(Init)) { + Path.push_back( + {IndirectLocalPathEntry::DefaultInit, DIE, DIE->getField()}); + Init = DIE->getExpr(); + } + + if (auto *FE = dyn_cast(Init)) + Init = FE->getSubExpr(); + + // Dig out the expression which constructs the extended temporary. + Init = const_cast(Init->skipRValueSubobjectAdjustments()); + + if (CXXBindTemporaryExpr *BTE = dyn_cast(Init)) + Init = BTE->getSubExpr(); + + Init = Init->IgnoreParens(); + + // Step over value-preserving rvalue casts. + if (auto *CE = dyn_cast(Init)) { + switch (CE->getCastKind()) { + case CK_LValueToRValue: + // If we can match the lvalue to a const object, we can look at its + // initializer. + Path.push_back({IndirectLocalPathEntry::LValToRVal, CE}); + return visitLocalsRetainedByReferenceBinding( + Path, Init, RK_ReferenceBinding, + [&](IndirectLocalPath &Path, Local L, ReferenceKind RK) -> bool { + if (auto *DRE = dyn_cast(L)) { + auto *VD = dyn_cast(DRE->getDecl()); + if (VD && VD->getType().isConstQualified() && VD->getInit() && + !isVarOnPath(Path, VD)) { + Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); + visitLocalsRetainedByInitializer( + Path, VD->getInit(), Visit, true, EnableLifetimeWarnings); + } + } else if (auto *MTE = dyn_cast(L)) { + if (MTE->getType().isConstQualified()) + visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), + Visit, true, + EnableLifetimeWarnings); + } + return false; + }, + EnableLifetimeWarnings); + + // We assume that objects can be retained by pointers cast to integers, + // but not if the integer is cast to floating-point type or to _Complex. + // We assume that casts to 'bool' do not preserve enough information to + // retain a local object. + case CK_NoOp: + case CK_BitCast: + case CK_BaseToDerived: + case CK_DerivedToBase: + case CK_UncheckedDerivedToBase: + case CK_Dynamic: + case CK_ToUnion: + case CK_UserDefinedConversion: + case CK_ConstructorConversion: + case CK_IntegralToPointer: + case CK_PointerToIntegral: + case CK_VectorSplat: + case CK_IntegralCast: + case CK_CPointerToObjCPointerCast: + case CK_BlockPointerToObjCPointerCast: + case CK_AnyPointerToBlockPointerCast: + case CK_AddressSpaceConversion: + break; + + case CK_ArrayToPointerDecay: + // Model array-to-pointer decay as taking the address of the array + // lvalue. + Path.push_back({IndirectLocalPathEntry::AddressOf, CE}); + return visitLocalsRetainedByReferenceBinding(Path, CE->getSubExpr(), + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); + + default: + return; + } + + Init = CE->getSubExpr(); + } + } while (Old != Init); + + // C++17 [dcl.init.list]p6: + // initializing an initializer_list object from the array extends the + // lifetime of the array exactly like binding a reference to a temporary. + if (auto *ILE = dyn_cast(Init)) + return visitLocalsRetainedByReferenceBinding(Path, ILE->getSubExpr(), + RK_StdInitializerList, Visit, + EnableLifetimeWarnings); + + if (InitListExpr *ILE = dyn_cast(Init)) { + // We already visited the elements of this initializer list while + // performing the initialization. Don't visit them again unless we've + // changed the lifetime of the initialized entity. 
+ if (!RevisitSubinits) + return; + + if (ILE->isTransparent()) + return visitLocalsRetainedByInitializer(Path, ILE->getInit(0), Visit, + RevisitSubinits, + EnableLifetimeWarnings); + + if (ILE->getType()->isArrayType()) { + for (unsigned I = 0, N = ILE->getNumInits(); I != N; ++I) + visitLocalsRetainedByInitializer(Path, ILE->getInit(I), Visit, + RevisitSubinits, + EnableLifetimeWarnings); + return; + } + + if (CXXRecordDecl *RD = ILE->getType()->getAsCXXRecordDecl()) { + assert(RD->isAggregate() && "aggregate init on non-aggregate"); + + // If we lifetime-extend a braced initializer which is initializing an + // aggregate, and that aggregate contains reference members which are + // bound to temporaries, those temporaries are also lifetime-extended. + if (RD->isUnion() && ILE->getInitializedFieldInUnion() && + ILE->getInitializedFieldInUnion()->getType()->isReferenceType()) + visitLocalsRetainedByReferenceBinding(Path, ILE->getInit(0), + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); + else { + unsigned Index = 0; + for (; Index < RD->getNumBases() && Index < ILE->getNumInits(); ++Index) + visitLocalsRetainedByInitializer(Path, ILE->getInit(Index), Visit, + RevisitSubinits, + EnableLifetimeWarnings); + for (const auto *I : RD->fields()) { + if (Index >= ILE->getNumInits()) + break; + if (I->isUnnamedBitField()) + continue; + Expr *SubInit = ILE->getInit(Index); + if (I->getType()->isReferenceType()) + visitLocalsRetainedByReferenceBinding(Path, SubInit, + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); + else + // This might be either aggregate-initialization of a member or + // initialization of a std::initializer_list object. Regardless, + // we should recursively lifetime-extend that initializer. + visitLocalsRetainedByInitializer( + Path, SubInit, Visit, RevisitSubinits, EnableLifetimeWarnings); + ++Index; + } + } + } + return; + } + + // The lifetime of an init-capture is that of the closure object constructed + // by a lambda-expression. + if (auto *LE = dyn_cast(Init)) { + LambdaExpr::capture_iterator CapI = LE->capture_begin(); + for (Expr *E : LE->capture_inits()) { + assert(CapI != LE->capture_end()); + const LambdaCapture &Cap = *CapI++; + if (!E) + continue; + if (Cap.capturesVariable()) + Path.push_back({IndirectLocalPathEntry::LambdaCaptureInit, E, &Cap}); + if (E->isGLValue()) + visitLocalsRetainedByReferenceBinding(Path, E, RK_ReferenceBinding, + Visit, EnableLifetimeWarnings); + else + visitLocalsRetainedByInitializer(Path, E, Visit, true, + EnableLifetimeWarnings); + if (Cap.capturesVariable()) + Path.pop_back(); + } + } + + // Assume that a copy or move from a temporary references the same objects + // that the temporary does. + if (auto *CCE = dyn_cast(Init)) { + if (CCE->getConstructor()->isCopyOrMoveConstructor()) { + if (auto *MTE = dyn_cast(CCE->getArg(0))) { + // assert(false && "hit temporary copy path"); + Expr *Arg = MTE->getSubExpr(); + Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg, + CCE->getConstructor()}); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true, + /*EnableLifetimeWarnings*/ false); + Path.pop_back(); + } + } + } + + if (isa(Init) || isa(Init)) { + if (EnableLifetimeWarnings) + handleGslAnnotatedTypes(Path, Init, Visit); + return visitLifetimeBoundArguments(Path, Init, Visit); + } + + switch (Init->getStmtClass()) { + case Stmt::UnaryOperatorClass: { + auto *UO = cast(Init); + // If the initializer is the address of a local, we could have a lifetime + // problem. 
+ if (UO->getOpcode() == UO_AddrOf) { + // If this is &rvalue, then it's ill-formed and we have already diagnosed + // it. Don't produce a redundant warning about the lifetime of the + // temporary. + if (isa(UO->getSubExpr())) + return; + + Path.push_back({IndirectLocalPathEntry::AddressOf, UO}); + visitLocalsRetainedByReferenceBinding(Path, UO->getSubExpr(), + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); + } + break; + } + + case Stmt::BinaryOperatorClass: { + // Handle pointer arithmetic. + auto *BO = cast(Init); + BinaryOperatorKind BOK = BO->getOpcode(); + if (!BO->getType()->isPointerType() || (BOK != BO_Add && BOK != BO_Sub)) + break; + + if (BO->getLHS()->getType()->isPointerType()) + visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true, + EnableLifetimeWarnings); + else if (BO->getRHS()->getType()->isPointerType()) + visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true, + EnableLifetimeWarnings); + break; + } + + case Stmt::ConditionalOperatorClass: + case Stmt::BinaryConditionalOperatorClass: { + auto *C = cast(Init); + // In C++, we can have a throw-expression operand, which has 'void' type + // and isn't interesting from a lifetime perspective. + if (!C->getTrueExpr()->getType()->isVoidType()) + visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true, + EnableLifetimeWarnings); + if (!C->getFalseExpr()->getType()->isVoidType()) + visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true, + EnableLifetimeWarnings); + break; + } + + case Stmt::BlockExprClass: + if (cast(Init)->getBlockDecl()->hasCaptures()) { + // This is a local block, whose lifetime is that of the function. + Visit(Path, Local(cast(Init)), RK_ReferenceBinding); + } + break; + + case Stmt::AddrLabelExprClass: + // We want to warn if the address of a label would escape the function. + Visit(Path, Local(cast(Init)), RK_ReferenceBinding); + break; + + default: + break; + } +} + +/// Whether a path to an object supports lifetime extension. +enum PathLifetimeKind { + /// Lifetime-extend along this path. + Extend, + /// We should lifetime-extend, but we don't because (due to technical + /// limitations) we can't. This happens for default member initializers, + /// which we don't clone for every use, so we don't have a unique + /// MaterializeTemporaryExpr to update. + ShouldExtend, + /// Do not lifetime extend along this path. + NoExtend +}; + +/// Determine whether this is an indirect path to a temporary that we are +/// supposed to lifetime-extend along. +static PathLifetimeKind +shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) { + PathLifetimeKind Kind = PathLifetimeKind::Extend; + for (auto Elem : Path) { + if (Elem.Kind == IndirectLocalPathEntry::DefaultInit) + Kind = PathLifetimeKind::ShouldExtend; + else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit) + return PathLifetimeKind::NoExtend; + } + return Kind; +} + +/// Find the range for the first interesting entry in the path at or after I. +static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I, + Expr *E) { + for (unsigned N = Path.size(); I != N; ++I) { + switch (Path[I].Kind) { + case IndirectLocalPathEntry::AddressOf: + case IndirectLocalPathEntry::LValToRVal: + case IndirectLocalPathEntry::LifetimeBoundCall: + case IndirectLocalPathEntry::TemporaryCopy: + case IndirectLocalPathEntry::GslReferenceInit: + case IndirectLocalPathEntry::GslPointerInit: + // These exist primarily to mark the path as not permitting or + // supporting lifetime extension. 
+ break; + + case IndirectLocalPathEntry::VarInit: + if (cast(Path[I].D)->isImplicit()) + return SourceRange(); + [[fallthrough]]; + case IndirectLocalPathEntry::DefaultInit: + return Path[I].E->getSourceRange(); + + case IndirectLocalPathEntry::LambdaCaptureInit: + if (!Path[I].Capture->capturesVariable()) + continue; + return Path[I].E->getSourceRange(); + } + } + return E->getSourceRange(); +} + +static bool pathOnlyInitializesGslPointer(IndirectLocalPath &Path) { + for (const auto &It : llvm::reverse(Path)) { + if (It.Kind == IndirectLocalPathEntry::VarInit) + continue; + if (It.Kind == IndirectLocalPathEntry::AddressOf) + continue; + if (It.Kind == IndirectLocalPathEntry::LifetimeBoundCall) + continue; + return It.Kind == IndirectLocalPathEntry::GslPointerInit || + It.Kind == IndirectLocalPathEntry::GslReferenceInit; + } + return false; +} + +void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, + Expr *Init) { + LifetimeResult LR = getEntityLifetime(&Entity); + LifetimeKind LK = LR.getInt(); + const InitializedEntity *ExtendingEntity = LR.getPointer(); + + // If this entity doesn't have an interesting lifetime, don't bother looking + // for temporaries within its initializer. + if (LK == LK_FullExpression) + return; + + auto TemporaryVisitor = [&](IndirectLocalPath &Path, Local L, + ReferenceKind RK) -> bool { + SourceRange DiagRange = nextPathEntryRange(Path, 0, L); + SourceLocation DiagLoc = DiagRange.getBegin(); + + auto *MTE = dyn_cast(L); + + bool IsGslPtrInitWithGslTempOwner = false; + bool IsLocalGslOwner = false; + if (pathOnlyInitializesGslPointer(Path)) { + if (isa(L)) { + // We do not want to follow the references when returning a pointer + // originating from a local owner to avoid the following false positive: + // int &p = *localUniquePtr; + // someContainer.add(std::move(localUniquePtr)); + // return p; + IsLocalGslOwner = isRecordWithAttr(L->getType()); + if (pathContainsInit(Path) || !IsLocalGslOwner) + return false; + } else { + IsGslPtrInitWithGslTempOwner = + MTE && !MTE->getExtendingDecl() && + isRecordWithAttr(MTE->getType()); + // Skipping a chain of initializing gsl::Pointer annotated objects. + // We are looking only for the final source to find out if it was + // a local or temporary owner or the address of a local variable/param. + if (!IsGslPtrInitWithGslTempOwner) + return true; + } + } + + switch (LK) { + case LK_FullExpression: + llvm_unreachable("already handled this"); + + case LK_Extended: { + if (!MTE) { + // The initialized entity has lifetime beyond the full-expression, + // and the local entity does too, so don't warn. + // + // FIXME: We should consider warning if a static / thread storage + // duration variable retains an automatic storage duration local. + return false; + } + + if (IsGslPtrInitWithGslTempOwner && DiagLoc.isValid()) { + SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer) + << DiagRange; + return false; + } + + switch (shouldLifetimeExtendThroughPath(Path)) { + case PathLifetimeKind::Extend: + // Update the storage duration of the materialized temporary. + // FIXME: Rebuild the expression instead of mutating it. + MTE->setExtendingDecl(ExtendingEntity->getDecl(), + ExtendingEntity->allocateManglingNumber()); + // Also visit the temporaries lifetime-extended by this initializer. + return true; + + case PathLifetimeKind::ShouldExtend: + // We're supposed to lifetime-extend the temporary along this path (per + // the resolution of DR1815), but we don't support that yet. 
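+ // Illustration: aggregate initialization that relies on a default member
+ // initializer binding a reference to a temporary, e.g.
+ //   struct S { const int &r = 10; }; S s{};
+ // lands here, because the shared initializer cannot be rewritten per use.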
+ // + // FIXME: Properly handle this situation. Perhaps the easiest approach + // would be to clone the initializer expression on each use that would + // lifetime extend its temporaries. + SemaRef.Diag(DiagLoc, diag::warn_unsupported_lifetime_extension) + << RK << DiagRange; + break; + + case PathLifetimeKind::NoExtend: + // If the path goes through the initialization of a variable or field, + // it can't possibly reach a temporary created in this full-expression. + // We will have already diagnosed any problems with the initializer. + if (pathContainsInit(Path)) + return false; + + SemaRef.Diag(DiagLoc, diag::warn_dangling_variable) + << RK << !Entity.getParent() + << ExtendingEntity->getDecl()->isImplicit() + << ExtendingEntity->getDecl() << Init->isGLValue() << DiagRange; + break; + } + break; + } + + case LK_MemInitializer: { + if (isa(L)) { + // Under C++ DR1696, if a mem-initializer (or a default member + // initializer used by the absence of one) would lifetime-extend a + // temporary, the program is ill-formed. + if (auto *ExtendingDecl = + ExtendingEntity ? ExtendingEntity->getDecl() : nullptr) { + if (IsGslPtrInitWithGslTempOwner) { + SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer_member) + << ExtendingDecl << DiagRange; + SemaRef.Diag(ExtendingDecl->getLocation(), + diag::note_ref_or_ptr_member_declared_here) + << true; + return false; + } + bool IsSubobjectMember = ExtendingEntity != &Entity; + SemaRef.Diag(DiagLoc, shouldLifetimeExtendThroughPath(Path) != + PathLifetimeKind::NoExtend + ? diag::err_dangling_member + : diag::warn_dangling_member) + << ExtendingDecl << IsSubobjectMember << RK << DiagRange; + // Don't bother adding a note pointing to the field if we're inside + // its default member initializer; our primary diagnostic points to + // the same place in that case. + if (Path.empty() || + Path.back().Kind != IndirectLocalPathEntry::DefaultInit) { + SemaRef.Diag(ExtendingDecl->getLocation(), + diag::note_lifetime_extending_member_declared_here) + << RK << IsSubobjectMember; + } + } else { + // We have a mem-initializer but no particular field within it; this + // is either a base class or a delegating initializer directly + // initializing the base-class from something that doesn't live long + // enough. + // + // FIXME: Warn on this. + return false; + } + } else { + // Paths via a default initializer can only occur during error recovery + // (there's no other way that a default initializer can refer to a + // local). Don't produce a bogus warning on those cases. + if (pathContainsInit(Path)) + return false; + + // Suppress false positives for code like the one below: + // Ctor(unique_ptr up) : member(*up), member2(move(up)) {} + if (IsLocalGslOwner && pathOnlyInitializesGslPointer(Path)) + return false; + + auto *DRE = dyn_cast(L); + auto *VD = DRE ? dyn_cast(DRE->getDecl()) : nullptr; + if (!VD) { + // A member was initialized to a local block. + // FIXME: Warn on this. + return false; + } + + if (auto *Member = + ExtendingEntity ? ExtendingEntity->getDecl() : nullptr) { + bool IsPointer = !Member->getType()->isReferenceType(); + SemaRef.Diag(DiagLoc, + IsPointer ? 
diag::warn_init_ptr_member_to_parameter_addr + : diag::warn_bind_ref_member_to_parameter) + << Member << VD << isa(VD) << DiagRange; + SemaRef.Diag(Member->getLocation(), + diag::note_ref_or_ptr_member_declared_here) + << (unsigned)IsPointer; + } + } + break; + } + + case LK_New: + if (isa(L)) { + if (IsGslPtrInitWithGslTempOwner) + SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer) + << DiagRange; + else + SemaRef.Diag(DiagLoc, RK == RK_ReferenceBinding + ? diag::warn_new_dangling_reference + : diag::warn_new_dangling_initializer_list) + << !Entity.getParent() << DiagRange; + } else { + // We can't determine if the allocation outlives the local declaration. + return false; + } + break; + + case LK_Return: + case LK_StmtExprResult: + if (auto *DRE = dyn_cast(L)) { + // We can't determine if the local variable outlives the statement + // expression. + if (LK == LK_StmtExprResult) + return false; + SemaRef.Diag(DiagLoc, diag::warn_ret_stack_addr_ref) + << Entity.getType()->isReferenceType() << DRE->getDecl() + << isa(DRE->getDecl()) << DiagRange; + } else if (isa(L)) { + SemaRef.Diag(DiagLoc, diag::err_ret_local_block) << DiagRange; + } else if (isa(L)) { + // Don't warn when returning a label from a statement expression. + // Leaving the scope doesn't end its lifetime. + if (LK == LK_StmtExprResult) + return false; + SemaRef.Diag(DiagLoc, diag::warn_ret_addr_label) << DiagRange; + } else if (auto *CLE = dyn_cast(L)) { + SemaRef.Diag(DiagLoc, diag::warn_ret_stack_addr_ref) + << Entity.getType()->isReferenceType() << CLE->getInitializer() << 2 + << DiagRange; + } else { + // P2748R5: Disallow Binding a Returned Glvalue to a Temporary. + // [stmt.return]/p6: In a function whose return type is a reference, + // other than an invented function for std::is_convertible ([meta.rel]), + // a return statement that binds the returned reference to a temporary + // expression ([class.temporary]) is ill-formed. + if (SemaRef.getLangOpts().CPlusPlus26 && + Entity.getType()->isReferenceType()) + SemaRef.Diag(DiagLoc, diag::err_ret_local_temp_ref) + << Entity.getType()->isReferenceType() << DiagRange; + else + SemaRef.Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref) + << Entity.getType()->isReferenceType() << DiagRange; + } + break; + } + + for (unsigned I = 0; I != Path.size(); ++I) { + auto Elem = Path[I]; + + switch (Elem.Kind) { + case IndirectLocalPathEntry::AddressOf: + case IndirectLocalPathEntry::LValToRVal: + // These exist primarily to mark the path as not permitting or + // supporting lifetime extension. + break; + + case IndirectLocalPathEntry::LifetimeBoundCall: + case IndirectLocalPathEntry::TemporaryCopy: + case IndirectLocalPathEntry::GslPointerInit: + case IndirectLocalPathEntry::GslReferenceInit: + // FIXME: Consider adding a note for these. + break; + + case IndirectLocalPathEntry::DefaultInit: { + auto *FD = cast(Elem.D); + SemaRef.Diag(FD->getLocation(), + diag::note_init_with_default_member_initializer) + << FD << nextPathEntryRange(Path, I + 1, L); + break; + } + + case IndirectLocalPathEntry::VarInit: { + const VarDecl *VD = cast(Elem.D); + SemaRef.Diag(VD->getLocation(), diag::note_local_var_initializer) + << VD->getType()->isReferenceType() << VD->isImplicit() + << VD->getDeclName() << nextPathEntryRange(Path, I + 1, L); + break; + } + + case IndirectLocalPathEntry::LambdaCaptureInit: + if (!Elem.Capture->capturesVariable()) + break; + // FIXME: We can't easily tell apart an init-capture from a nested + // capture of an init-capture. 
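+ // Illustration: an init-capture such as '[p = &local] { return p; }' records
+ // a LambdaCaptureInit entry, as does a nested lambda capturing that
+ // init-capture; the note below points at the capture initializer.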
+ const ValueDecl *VD = Elem.Capture->getCapturedVar(); + SemaRef.Diag(Elem.Capture->getLocation(), + diag::note_lambda_capture_initializer) + << VD << VD->isInitCapture() << Elem.Capture->isExplicit() + << (Elem.Capture->getCaptureKind() == LCK_ByRef) << VD + << nextPathEntryRange(Path, I + 1, L); + break; + } + } + + // We didn't lifetime-extend, so don't go any further; we don't need more + // warnings or errors on inner temporaries within this one's initializer. + return false; + }; + + bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer, SourceLocation()); + llvm::SmallVector Path; + if (Init->isGLValue()) + visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding, + TemporaryVisitor, + EnableLifetimeWarnings); + else + visitLocalsRetainedByInitializer(Path, Init, TemporaryVisitor, false, + EnableLifetimeWarnings); +} + +} // namespace clang::sema diff --git a/clang/lib/Sema/CheckExprLifetime.h b/clang/lib/Sema/CheckExprLifetime.h new file mode 100644 index 00000000000000..f9cd3f55d3dbf2 --- /dev/null +++ b/clang/lib/Sema/CheckExprLifetime.h @@ -0,0 +1,29 @@ +//===- CheckExprLifetime.h ----------------------------------- -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//===----------------------------------------------------------------------===// +// +// This files implements a statement-local lifetime analysis. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_SEMA_CHECK_EXPR_LIFETIME_H +#define LLVM_CLANG_SEMA_CHECK_EXPR_LIFETIME_H + +#include "clang/AST/Expr.h" +#include "clang/Sema/Initialization.h" +#include "clang/Sema/Sema.h" + +namespace clang::sema { + +/// Check that the lifetime of the given expr (and its subobjects) is +/// sufficient for initializing the entity, and perform lifetime extension +/// (when permitted) if not. +void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, + Expr *Init); + +} // namespace clang::sema + +#endif // LLVM_CLANG_SEMA_CHECK_EXPR_LIFETIME_H diff --git a/clang/lib/Sema/ScopeInfo.cpp b/clang/lib/Sema/ScopeInfo.cpp index ce90451f2613bf..12fb7060727233 100644 --- a/clang/lib/Sema/ScopeInfo.cpp +++ b/clang/lib/Sema/ScopeInfo.cpp @@ -39,6 +39,7 @@ void FunctionScopeInfo::Clear() { FirstReturnLoc = SourceLocation(); FirstCXXOrObjCTryLoc = SourceLocation(); FirstSEHTryLoc = SourceLocation(); + FirstVLALoc = SourceLocation(); FoundImmediateEscalatingExpression = false; // Coroutine state diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 069978c1b40239..3f8f2f027172de 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -638,6 +638,19 @@ void Sema::diagnoseNullableToNonnullConversion(QualType DstType, Diag(Loc, diag::warn_nullability_lost) << SrcType << DstType; } +// Generate diagnostics when adding or removing effects in a type conversion. 
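+// For example, an implicit conversion that would let a plain function pointer
+// be treated as one whose type carries a 'nonblocking' effect adds (spoofs)
+// the effect and is diagnosed; dropping an effect in a conversion is not.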
+void Sema::diagnoseFunctionEffectConversion(QualType DstType, QualType SrcType, + SourceLocation Loc) { + const auto SrcFX = FunctionEffectsRef::get(SrcType); + const auto DstFX = FunctionEffectsRef::get(DstType); + if (SrcFX != DstFX) { + for (const auto &Diff : FunctionEffectDifferences(SrcFX, DstFX)) { + if (Diff.shouldDiagnoseConversion(SrcType, SrcFX, DstType, DstFX)) + Diag(Loc, diag::warn_invalid_add_func_effects) << Diff.effectName(); + } + } +} + void Sema::diagnoseZeroToNullptrConversion(CastKind Kind, const Expr *E) { // nullptr only exists from C++11 on, so don't warn on its absence earlier. if (!getLangOpts().CPlusPlus11) @@ -715,6 +728,9 @@ ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty, diagnoseNullableToNonnullConversion(Ty, E->getType(), E->getBeginLoc()); diagnoseZeroToNullptrConversion(Kind, E); + if (!isCast(CCK) && Kind != CK_NullToPointer && + Kind != CK_NullToMemberPointer) + diagnoseFunctionEffectConversion(Ty, E->getType(), E->getBeginLoc()); QualType ExprTy = Context.getCanonicalType(E->getType()); QualType TypeTy = Context.getCanonicalType(Ty); @@ -2796,3 +2812,153 @@ bool Sema::isDeclaratorFunctionLike(Declarator &D) { }); return Result; } + +FunctionEffectDifferences::FunctionEffectDifferences( + const FunctionEffectsRef &Old, const FunctionEffectsRef &New) { + + FunctionEffectsRef::iterator POld = Old.begin(); + FunctionEffectsRef::iterator OldEnd = Old.end(); + FunctionEffectsRef::iterator PNew = New.begin(); + FunctionEffectsRef::iterator NewEnd = New.end(); + + while (true) { + int cmp = 0; + if (POld == OldEnd) { + if (PNew == NewEnd) + break; + cmp = 1; + } else if (PNew == NewEnd) + cmp = -1; + else { + FunctionEffectWithCondition Old = *POld; + FunctionEffectWithCondition New = *PNew; + if (Old.Effect.kind() < New.Effect.kind()) + cmp = -1; + else if (New.Effect.kind() < Old.Effect.kind()) + cmp = 1; + else { + cmp = 0; + if (Old.Cond.getCondition() != New.Cond.getCondition()) { + // FIXME: Cases where the expressions are equivalent but + // don't have the same identity. + push_back(FunctionEffectDiff{ + Old.Effect.kind(), FunctionEffectDiff::Kind::ConditionMismatch, + Old, New}); + } + } + } + + if (cmp < 0) { + // removal + FunctionEffectWithCondition Old = *POld; + push_back(FunctionEffectDiff{ + Old.Effect.kind(), FunctionEffectDiff::Kind::Removed, Old, {}}); + ++POld; + } else if (cmp > 0) { + // addition + FunctionEffectWithCondition New = *PNew; + push_back(FunctionEffectDiff{ + New.Effect.kind(), FunctionEffectDiff::Kind::Added, {}, New}); + ++PNew; + } else { + ++POld; + ++PNew; + } + } +} + +bool FunctionEffectDiff::shouldDiagnoseConversion( + QualType SrcType, const FunctionEffectsRef &SrcFX, QualType DstType, + const FunctionEffectsRef &DstFX) const { + + switch (EffectKind) { + case FunctionEffect::Kind::NonAllocating: + // nonallocating can't be added (spoofed) during a conversion, unless we + // have nonblocking. + if (DiffKind == Kind::Added) { + for (const auto &CFE : SrcFX) { + if (CFE.Effect.kind() == FunctionEffect::Kind::NonBlocking) + return false; + } + } + [[fallthrough]]; + case FunctionEffect::Kind::NonBlocking: + // nonblocking can't be added (spoofed) during a conversion. + switch (DiffKind) { + case Kind::Added: + return true; + case Kind::Removed: + return false; + case Kind::ConditionMismatch: + // FIXME: Condition mismatches are too coarse right now -- expressions + // which are equivalent but don't have the same identity are detected as + // mismatches. 
We're going to diagnose those anyhow until expression + // matching is better. + return true; + } + case FunctionEffect::Kind::Blocking: + case FunctionEffect::Kind::Allocating: + return false; + case FunctionEffect::Kind::None: + break; + } + llvm_unreachable("unknown effect kind"); +} + +bool FunctionEffectDiff::shouldDiagnoseRedeclaration( + const FunctionDecl &OldFunction, const FunctionEffectsRef &OldFX, + const FunctionDecl &NewFunction, const FunctionEffectsRef &NewFX) const { + switch (EffectKind) { + case FunctionEffect::Kind::NonAllocating: + case FunctionEffect::Kind::NonBlocking: + // nonblocking/nonallocating can't be removed in a redeclaration. + switch (DiffKind) { + case Kind::Added: + return false; // No diagnostic. + case Kind::Removed: + return true; // Issue diagnostic. + case Kind::ConditionMismatch: + // All these forms of mismatches are diagnosed. + return true; + } + case FunctionEffect::Kind::Blocking: + case FunctionEffect::Kind::Allocating: + return false; + case FunctionEffect::Kind::None: + break; + } + llvm_unreachable("unknown effect kind"); +} + +FunctionEffectDiff::OverrideResult +FunctionEffectDiff::shouldDiagnoseMethodOverride( + const CXXMethodDecl &OldMethod, const FunctionEffectsRef &OldFX, + const CXXMethodDecl &NewMethod, const FunctionEffectsRef &NewFX) const { + switch (EffectKind) { + case FunctionEffect::Kind::NonAllocating: + case FunctionEffect::Kind::NonBlocking: + switch (DiffKind) { + + // If added on an override, that's fine and not diagnosed. + case Kind::Added: + return OverrideResult::NoAction; + + // If missing from an override (removed), propagate from base to derived. + case Kind::Removed: + return OverrideResult::Merge; + + // If there's a mismatch involving the effect's polarity or condition, + // issue a warning. + case Kind::ConditionMismatch: + return OverrideResult::Warn; + } + + case FunctionEffect::Kind::Blocking: + case FunctionEffect::Kind::Allocating: + return OverrideResult::NoAction; + + case FunctionEffect::Kind::None: + break; + } + llvm_unreachable("unknown effect kind"); +} diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 87988519e76916..cc3615da7f9401 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -64,12 +64,14 @@ #include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaBPF.h" +#include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaHexagon.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaLoongArch.h" #include "clang/Sema/SemaMIPS.h" #include "clang/Sema/SemaNVPTX.h" #include "clang/Sema/SemaObjC.h" +#include "clang/Sema/SemaOpenCL.h" #include "clang/Sema/SemaPPC.h" #include "clang/Sema/SemaRISCV.h" #include "clang/Sema/SemaSystemZ.h" @@ -183,6 +185,33 @@ bool Sema::checkArgCount(CallExpr *Call, unsigned DesiredArgCount) { << /*is non object*/ 0 << Call->getArg(1)->getSourceRange(); } +static bool checkBuiltinVerboseTrap(CallExpr *Call, Sema &S) { + bool HasError = false; + + for (unsigned I = 0; I < Call->getNumArgs(); ++I) { + Expr *Arg = Call->getArg(I); + + if (Arg->isValueDependent()) + continue; + + std::optional ArgString = Arg->tryEvaluateString(S.Context); + int DiagMsgKind = -1; + // Arguments must be pointers to constant strings and cannot use '$'. 
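+ // Illustration: '__builtin_verbose_trap("check", "out of bounds")' passes,
+ // while a non-constant argument or one containing '$' is diagnosed below.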
+ if (!ArgString.has_value()) + DiagMsgKind = 0; + else if (ArgString->find('$') != std::string::npos) + DiagMsgKind = 1; + + if (DiagMsgKind >= 0) { + S.Diag(Arg->getBeginLoc(), diag::err_builtin_verbose_trap_arg) + << DiagMsgKind << Arg->getSourceRange(); + HasError = true; + } + } + + return !HasError; +} + static bool convertArgumentToType(Sema &S, Expr *&Value, QualType Ty) { if (Value->isTypeDependent()) return false; @@ -1459,528 +1488,6 @@ static bool BuiltinSEHScopeCheck(Sema &SemaRef, CallExpr *TheCall, return false; } -static inline bool isBlockPointer(Expr *Arg) { - return Arg->getType()->isBlockPointerType(); -} - -/// OpenCL C v2.0, s6.13.17.2 - Checks that the block parameters are all local -/// void*, which is a requirement of device side enqueue. -static bool checkOpenCLBlockArgs(Sema &S, Expr *BlockArg) { - const BlockPointerType *BPT = - cast(BlockArg->getType().getCanonicalType()); - ArrayRef Params = - BPT->getPointeeType()->castAs()->getParamTypes(); - unsigned ArgCounter = 0; - bool IllegalParams = false; - // Iterate through the block parameters until either one is found that is not - // a local void*, or the block is valid. - for (ArrayRef::iterator I = Params.begin(), E = Params.end(); - I != E; ++I, ++ArgCounter) { - if (!(*I)->isPointerType() || !(*I)->getPointeeType()->isVoidType() || - (*I)->getPointeeType().getQualifiers().getAddressSpace() != - LangAS::opencl_local) { - // Get the location of the error. If a block literal has been passed - // (BlockExpr) then we can point straight to the offending argument, - // else we just point to the variable reference. - SourceLocation ErrorLoc; - if (isa(BlockArg)) { - BlockDecl *BD = cast(BlockArg)->getBlockDecl(); - ErrorLoc = BD->getParamDecl(ArgCounter)->getBeginLoc(); - } else if (isa(BlockArg)) { - ErrorLoc = cast(BlockArg)->getBeginLoc(); - } - S.Diag(ErrorLoc, - diag::err_opencl_enqueue_kernel_blocks_non_local_void_args); - IllegalParams = true; - } - } - - return IllegalParams; -} - -static bool checkOpenCLSubgroupExt(Sema &S, CallExpr *Call) { - // OpenCL device can support extension but not the feature as extension - // requires subgroup independent forward progress, but subgroup independent - // forward progress is optional in OpenCL C 3.0 __opencl_c_subgroups feature. - if (!S.getOpenCLOptions().isSupported("cl_khr_subgroups", S.getLangOpts()) && - !S.getOpenCLOptions().isSupported("__opencl_c_subgroups", - S.getLangOpts())) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_requires_extension) - << 1 << Call->getDirectCallee() - << "cl_khr_subgroups or __opencl_c_subgroups"; - return true; - } - return false; -} - -static bool OpenCLBuiltinNDRangeAndBlock(Sema &S, CallExpr *TheCall) { - if (S.checkArgCount(TheCall, 2)) - return true; - - if (checkOpenCLSubgroupExt(S, TheCall)) - return true; - - // First argument is an ndrange_t type. 
- Expr *NDRangeArg = TheCall->getArg(0); - if (NDRangeArg->getType().getUnqualifiedType().getAsString() != "ndrange_t") { - S.Diag(NDRangeArg->getBeginLoc(), diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "'ndrange_t'"; - return true; - } - - Expr *BlockArg = TheCall->getArg(1); - if (!isBlockPointer(BlockArg)) { - S.Diag(BlockArg->getBeginLoc(), diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "block"; - return true; - } - return checkOpenCLBlockArgs(S, BlockArg); -} - -/// OpenCL C v2.0, s6.13.17.6 - Check the argument to the -/// get_kernel_work_group_size -/// and get_kernel_preferred_work_group_size_multiple builtin functions. -static bool OpenCLBuiltinKernelWorkGroupSize(Sema &S, CallExpr *TheCall) { - if (S.checkArgCount(TheCall, 1)) - return true; - - Expr *BlockArg = TheCall->getArg(0); - if (!isBlockPointer(BlockArg)) { - S.Diag(BlockArg->getBeginLoc(), diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "block"; - return true; - } - return checkOpenCLBlockArgs(S, BlockArg); -} - -/// Diagnose integer type and any valid implicit conversion to it. -static bool checkOpenCLEnqueueIntType(Sema &S, Expr *E, - const QualType &IntType); - -static bool checkOpenCLEnqueueLocalSizeArgs(Sema &S, CallExpr *TheCall, - unsigned Start, unsigned End) { - bool IllegalParams = false; - for (unsigned I = Start; I <= End; ++I) - IllegalParams |= checkOpenCLEnqueueIntType(S, TheCall->getArg(I), - S.Context.getSizeType()); - return IllegalParams; -} - -/// OpenCL v2.0, s6.13.17.1 - Check that sizes are provided for all -/// 'local void*' parameter of passed block. -static bool checkOpenCLEnqueueVariadicArgs(Sema &S, CallExpr *TheCall, - Expr *BlockArg, - unsigned NumNonVarArgs) { - const BlockPointerType *BPT = - cast(BlockArg->getType().getCanonicalType()); - unsigned NumBlockParams = - BPT->getPointeeType()->castAs()->getNumParams(); - unsigned TotalNumArgs = TheCall->getNumArgs(); - - // For each argument passed to the block, a corresponding uint needs to - // be passed to describe the size of the local memory. - if (TotalNumArgs != NumBlockParams + NumNonVarArgs) { - S.Diag(TheCall->getBeginLoc(), - diag::err_opencl_enqueue_kernel_local_size_args); - return true; - } - - // Check that the sizes of the local memory are specified by integers. - return checkOpenCLEnqueueLocalSizeArgs(S, TheCall, NumNonVarArgs, - TotalNumArgs - 1); -} - -/// OpenCL C v2.0, s6.13.17 - Enqueue kernel function contains four different -/// overload formats specified in Table 6.13.17.1. -/// int enqueue_kernel(queue_t queue, -/// kernel_enqueue_flags_t flags, -/// const ndrange_t ndrange, -/// void (^block)(void)) -/// int enqueue_kernel(queue_t queue, -/// kernel_enqueue_flags_t flags, -/// const ndrange_t ndrange, -/// uint num_events_in_wait_list, -/// clk_event_t *event_wait_list, -/// clk_event_t *event_ret, -/// void (^block)(void)) -/// int enqueue_kernel(queue_t queue, -/// kernel_enqueue_flags_t flags, -/// const ndrange_t ndrange, -/// void (^block)(local void*, ...), -/// uint size0, ...) -/// int enqueue_kernel(queue_t queue, -/// kernel_enqueue_flags_t flags, -/// const ndrange_t ndrange, -/// uint num_events_in_wait_list, -/// clk_event_t *event_wait_list, -/// clk_event_t *event_ret, -/// void (^block)(local void*, ...), -/// uint size0, ...) 
-static bool OpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { - unsigned NumArgs = TheCall->getNumArgs(); - - if (NumArgs < 4) { - S.Diag(TheCall->getBeginLoc(), - diag::err_typecheck_call_too_few_args_at_least) - << 0 << 4 << NumArgs << /*is non object*/ 0; - return true; - } - - Expr *Arg0 = TheCall->getArg(0); - Expr *Arg1 = TheCall->getArg(1); - Expr *Arg2 = TheCall->getArg(2); - Expr *Arg3 = TheCall->getArg(3); - - // First argument always needs to be a queue_t type. - if (!Arg0->getType()->isQueueT()) { - S.Diag(TheCall->getArg(0)->getBeginLoc(), - diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << S.Context.OCLQueueTy; - return true; - } - - // Second argument always needs to be a kernel_enqueue_flags_t enum value. - if (!Arg1->getType()->isIntegerType()) { - S.Diag(TheCall->getArg(1)->getBeginLoc(), - diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "'kernel_enqueue_flags_t' (i.e. uint)"; - return true; - } - - // Third argument is always an ndrange_t type. - if (Arg2->getType().getUnqualifiedType().getAsString() != "ndrange_t") { - S.Diag(TheCall->getArg(2)->getBeginLoc(), - diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "'ndrange_t'"; - return true; - } - - // With four arguments, there is only one form that the function could be - // called in: no events and no variable arguments. - if (NumArgs == 4) { - // check that the last argument is the right block type. - if (!isBlockPointer(Arg3)) { - S.Diag(Arg3->getBeginLoc(), diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "block"; - return true; - } - // we have a block type, check the prototype - const BlockPointerType *BPT = - cast(Arg3->getType().getCanonicalType()); - if (BPT->getPointeeType()->castAs()->getNumParams() > 0) { - S.Diag(Arg3->getBeginLoc(), - diag::err_opencl_enqueue_kernel_blocks_no_args); - return true; - } - return false; - } - // we can have block + varargs. - if (isBlockPointer(Arg3)) - return (checkOpenCLBlockArgs(S, Arg3) || - checkOpenCLEnqueueVariadicArgs(S, TheCall, Arg3, 4)); - // last two cases with either exactly 7 args or 7 args and varargs. - if (NumArgs >= 7) { - // check common block argument. - Expr *Arg6 = TheCall->getArg(6); - if (!isBlockPointer(Arg6)) { - S.Diag(Arg6->getBeginLoc(), diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "block"; - return true; - } - if (checkOpenCLBlockArgs(S, Arg6)) - return true; - - // Forth argument has to be any integer type. - if (!Arg3->getType()->isIntegerType()) { - S.Diag(TheCall->getArg(3)->getBeginLoc(), - diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() << "integer"; - return true; - } - // check remaining common arguments. - Expr *Arg4 = TheCall->getArg(4); - Expr *Arg5 = TheCall->getArg(5); - - // Fifth argument is always passed as a pointer to clk_event_t. - if (!Arg4->isNullPointerConstant(S.Context, - Expr::NPC_ValueDependentIsNotNull) && - !Arg4->getType()->getPointeeOrArrayElementType()->isClkEventT()) { - S.Diag(TheCall->getArg(4)->getBeginLoc(), - diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() - << S.Context.getPointerType(S.Context.OCLClkEventTy); - return true; - } - - // Sixth argument is always passed as a pointer to clk_event_t. 
- if (!Arg5->isNullPointerConstant(S.Context, - Expr::NPC_ValueDependentIsNotNull) && - !(Arg5->getType()->isPointerType() && - Arg5->getType()->getPointeeType()->isClkEventT())) { - S.Diag(TheCall->getArg(5)->getBeginLoc(), - diag::err_opencl_builtin_expected_type) - << TheCall->getDirectCallee() - << S.Context.getPointerType(S.Context.OCLClkEventTy); - return true; - } - - if (NumArgs == 7) - return false; - - return checkOpenCLEnqueueVariadicArgs(S, TheCall, Arg6, 7); - } - - // None of the specific case has been detected, give generic error - S.Diag(TheCall->getBeginLoc(), - diag::err_opencl_enqueue_kernel_incorrect_args); - return true; -} - -/// Returns OpenCL access qual. -static OpenCLAccessAttr *getOpenCLArgAccess(const Decl *D) { - return D->getAttr(); -} - -/// Returns true if pipe element type is different from the pointer. -static bool checkOpenCLPipeArg(Sema &S, CallExpr *Call) { - const Expr *Arg0 = Call->getArg(0); - // First argument type should always be pipe. - if (!Arg0->getType()->isPipeType()) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_first_arg) - << Call->getDirectCallee() << Arg0->getSourceRange(); - return true; - } - OpenCLAccessAttr *AccessQual = - getOpenCLArgAccess(cast(Arg0)->getDecl()); - // Validates the access qualifier is compatible with the call. - // OpenCL v2.0 s6.13.16 - The access qualifiers for pipe should only be - // read_only and write_only, and assumed to be read_only if no qualifier is - // specified. - switch (Call->getDirectCallee()->getBuiltinID()) { - case Builtin::BIread_pipe: - case Builtin::BIreserve_read_pipe: - case Builtin::BIcommit_read_pipe: - case Builtin::BIwork_group_reserve_read_pipe: - case Builtin::BIsub_group_reserve_read_pipe: - case Builtin::BIwork_group_commit_read_pipe: - case Builtin::BIsub_group_commit_read_pipe: - if (!(!AccessQual || AccessQual->isReadOnly())) { - S.Diag(Arg0->getBeginLoc(), - diag::err_opencl_builtin_pipe_invalid_access_modifier) - << "read_only" << Arg0->getSourceRange(); - return true; - } - break; - case Builtin::BIwrite_pipe: - case Builtin::BIreserve_write_pipe: - case Builtin::BIcommit_write_pipe: - case Builtin::BIwork_group_reserve_write_pipe: - case Builtin::BIsub_group_reserve_write_pipe: - case Builtin::BIwork_group_commit_write_pipe: - case Builtin::BIsub_group_commit_write_pipe: - if (!(AccessQual && AccessQual->isWriteOnly())) { - S.Diag(Arg0->getBeginLoc(), - diag::err_opencl_builtin_pipe_invalid_access_modifier) - << "write_only" << Arg0->getSourceRange(); - return true; - } - break; - default: - break; - } - return false; -} - -/// Returns true if pipe element type is different from the pointer. -static bool checkOpenCLPipePacketType(Sema &S, CallExpr *Call, unsigned Idx) { - const Expr *Arg0 = Call->getArg(0); - const Expr *ArgIdx = Call->getArg(Idx); - const PipeType *PipeTy = cast(Arg0->getType()); - const QualType EltTy = PipeTy->getElementType(); - const PointerType *ArgTy = ArgIdx->getType()->getAs(); - // The Idx argument should be a pointer and the type of the pointer and - // the type of pipe element should also be the same. - if (!ArgTy || - !S.Context.hasSameType( - EltTy, ArgTy->getPointeeType()->getCanonicalTypeInternal())) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) - << Call->getDirectCallee() << S.Context.getPointerType(EltTy) - << ArgIdx->getType() << ArgIdx->getSourceRange(); - return true; - } - return false; -} - -// Performs semantic analysis for the read/write_pipe call. 
-// \param S Reference to the semantic analyzer. -// \param Call A pointer to the builtin call. -// \return True if a semantic error has been found, false otherwise. -static bool BuiltinRWPipe(Sema &S, CallExpr *Call) { - // OpenCL v2.0 s6.13.16.2 - The built-in read/write - // functions have two forms. - switch (Call->getNumArgs()) { - case 2: - if (checkOpenCLPipeArg(S, Call)) - return true; - // The call with 2 arguments should be - // read/write_pipe(pipe T, T*). - // Check packet type T. - if (checkOpenCLPipePacketType(S, Call, 1)) - return true; - break; - - case 4: { - if (checkOpenCLPipeArg(S, Call)) - return true; - // The call with 4 arguments should be - // read/write_pipe(pipe T, reserve_id_t, uint, T*). - // Check reserve_id_t. - if (!Call->getArg(1)->getType()->isReserveIDT()) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) - << Call->getDirectCallee() << S.Context.OCLReserveIDTy - << Call->getArg(1)->getType() << Call->getArg(1)->getSourceRange(); - return true; - } - - // Check the index. - const Expr *Arg2 = Call->getArg(2); - if (!Arg2->getType()->isIntegerType() && - !Arg2->getType()->isUnsignedIntegerType()) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) - << Call->getDirectCallee() << S.Context.UnsignedIntTy - << Arg2->getType() << Arg2->getSourceRange(); - return true; - } - - // Check packet type T. - if (checkOpenCLPipePacketType(S, Call, 3)) - return true; - } break; - default: - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_arg_num) - << Call->getDirectCallee() << Call->getSourceRange(); - return true; - } - - return false; -} - -// Performs a semantic analysis on the {work_group_/sub_group_ -// /_}reserve_{read/write}_pipe -// \param S Reference to the semantic analyzer. -// \param Call The call to the builtin function to be analyzed. -// \return True if a semantic error was found, false otherwise. -static bool BuiltinReserveRWPipe(Sema &S, CallExpr *Call) { - if (S.checkArgCount(Call, 2)) - return true; - - if (checkOpenCLPipeArg(S, Call)) - return true; - - // Check the reserve size. - if (!Call->getArg(1)->getType()->isIntegerType() && - !Call->getArg(1)->getType()->isUnsignedIntegerType()) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) - << Call->getDirectCallee() << S.Context.UnsignedIntTy - << Call->getArg(1)->getType() << Call->getArg(1)->getSourceRange(); - return true; - } - - // Since return type of reserve_read/write_pipe built-in function is - // reserve_id_t, which is not defined in the builtin def file , we used int - // as return type and need to override the return type of these functions. - Call->setType(S.Context.OCLReserveIDTy); - - return false; -} - -// Performs a semantic analysis on {work_group_/sub_group_ -// /_}commit_{read/write}_pipe -// \param S Reference to the semantic analyzer. -// \param Call The call to the builtin function to be analyzed. -// \return True if a semantic error was found, false otherwise. -static bool BuiltinCommitRWPipe(Sema &S, CallExpr *Call) { - if (S.checkArgCount(Call, 2)) - return true; - - if (checkOpenCLPipeArg(S, Call)) - return true; - - // Check reserve_id_t. 
- if (!Call->getArg(1)->getType()->isReserveIDT()) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) - << Call->getDirectCallee() << S.Context.OCLReserveIDTy - << Call->getArg(1)->getType() << Call->getArg(1)->getSourceRange(); - return true; - } - - return false; -} - -// Performs a semantic analysis on the call to built-in Pipe -// Query Functions. -// \param S Reference to the semantic analyzer. -// \param Call The call to the builtin function to be analyzed. -// \return True if a semantic error was found, false otherwise. -static bool BuiltinPipePackets(Sema &S, CallExpr *Call) { - if (S.checkArgCount(Call, 1)) - return true; - - if (!Call->getArg(0)->getType()->isPipeType()) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_first_arg) - << Call->getDirectCallee() << Call->getArg(0)->getSourceRange(); - return true; - } - - return false; -} - -// OpenCL v2.0 s6.13.9 - Address space qualifier functions. -// Performs semantic analysis for the to_global/local/private call. -// \param S Reference to the semantic analyzer. -// \param BuiltinID ID of the builtin function. -// \param Call A pointer to the builtin call. -// \return True if a semantic error has been found, false otherwise. -static bool OpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID, CallExpr *Call) { - if (S.checkArgCount(Call, 1)) - return true; - - auto RT = Call->getArg(0)->getType(); - if (!RT->isPointerType() || RT->getPointeeType() - .getAddressSpace() == LangAS::opencl_constant) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_to_addr_invalid_arg) - << Call->getArg(0) << Call->getDirectCallee() << Call->getSourceRange(); - return true; - } - - if (RT->getPointeeType().getAddressSpace() != LangAS::opencl_generic) { - S.Diag(Call->getArg(0)->getBeginLoc(), - diag::warn_opencl_generic_address_space_arg) - << Call->getDirectCallee()->getNameInfo().getAsString() - << Call->getArg(0)->getSourceRange(); - } - - RT = RT->getPointeeType(); - auto Qual = RT.getQualifiers(); - switch (BuiltinID) { - case Builtin::BIto_global: - Qual.setAddressSpace(LangAS::opencl_global); - break; - case Builtin::BIto_local: - Qual.setAddressSpace(LangAS::opencl_local); - break; - case Builtin::BIto_private: - Qual.setAddressSpace(LangAS::opencl_private); - break; - default: - llvm_unreachable("Invalid builtin function"); - } - Call->setType(S.Context.getPointerType(S.Context.getQualifiedType( - RT.getUnqualifiedType(), Qual))); - - return false; -} - namespace { enum PointerAuthOpKind { PAO_Strip, @@ -3051,59 +2558,59 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BIwrite_pipe: // Since those two functions are declared with var args, we need a semantic // check for the argument. 
- if (BuiltinRWPipe(*this, TheCall)) + if (OpenCL().checkBuiltinRWPipe(TheCall)) return ExprError(); break; case Builtin::BIreserve_read_pipe: case Builtin::BIreserve_write_pipe: case Builtin::BIwork_group_reserve_read_pipe: case Builtin::BIwork_group_reserve_write_pipe: - if (BuiltinReserveRWPipe(*this, TheCall)) + if (OpenCL().checkBuiltinReserveRWPipe(TheCall)) return ExprError(); break; case Builtin::BIsub_group_reserve_read_pipe: case Builtin::BIsub_group_reserve_write_pipe: - if (checkOpenCLSubgroupExt(*this, TheCall) || - BuiltinReserveRWPipe(*this, TheCall)) + if (OpenCL().checkSubgroupExt(TheCall) || + OpenCL().checkBuiltinReserveRWPipe(TheCall)) return ExprError(); break; case Builtin::BIcommit_read_pipe: case Builtin::BIcommit_write_pipe: case Builtin::BIwork_group_commit_read_pipe: case Builtin::BIwork_group_commit_write_pipe: - if (BuiltinCommitRWPipe(*this, TheCall)) + if (OpenCL().checkBuiltinCommitRWPipe(TheCall)) return ExprError(); break; case Builtin::BIsub_group_commit_read_pipe: case Builtin::BIsub_group_commit_write_pipe: - if (checkOpenCLSubgroupExt(*this, TheCall) || - BuiltinCommitRWPipe(*this, TheCall)) + if (OpenCL().checkSubgroupExt(TheCall) || + OpenCL().checkBuiltinCommitRWPipe(TheCall)) return ExprError(); break; case Builtin::BIget_pipe_num_packets: case Builtin::BIget_pipe_max_packets: - if (BuiltinPipePackets(*this, TheCall)) + if (OpenCL().checkBuiltinPipePackets(TheCall)) return ExprError(); break; case Builtin::BIto_global: case Builtin::BIto_local: case Builtin::BIto_private: - if (OpenCLBuiltinToAddr(*this, BuiltinID, TheCall)) + if (OpenCL().checkBuiltinToAddr(BuiltinID, TheCall)) return ExprError(); break; // OpenCL v2.0, s6.13.17 - Enqueue kernel functions. case Builtin::BIenqueue_kernel: - if (OpenCLBuiltinEnqueueKernel(*this, TheCall)) + if (OpenCL().checkBuiltinEnqueueKernel(TheCall)) return ExprError(); break; case Builtin::BIget_kernel_work_group_size: case Builtin::BIget_kernel_preferred_work_group_size_multiple: - if (OpenCLBuiltinKernelWorkGroupSize(*this, TheCall)) + if (OpenCL().checkBuiltinKernelWorkGroupSize(TheCall)) return ExprError(); break; case Builtin::BIget_kernel_max_sub_group_size_for_ndrange: case Builtin::BIget_kernel_sub_group_count_for_ndrange: - if (OpenCLBuiltinNDRangeAndBlock(*this, TheCall)) + if (OpenCL().checkBuiltinNDRangeAndBlock(TheCall)) return ExprError(); break; case Builtin::BI__builtin_os_log_format: @@ -3351,6 +2858,11 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_matrix_column_major_store: return BuiltinMatrixColumnMajorStore(TheCall, TheCallResult); + case Builtin::BI__builtin_verbose_trap: + if (!checkBuiltinVerboseTrap(TheCall, *this)) + return ExprError(); + break; + case Builtin::BI__builtin_get_device_side_mangled_name: { auto Check = [](CallExpr *TheCall) { if (TheCall->getNumArgs() != 1) @@ -3393,7 +2905,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, } } - if (getLangOpts().HLSL && CheckHLSLBuiltinFunctionCall(BuiltinID, TheCall)) + if (getLangOpts().HLSL && HLSL().CheckBuiltinFunctionCall(BuiltinID, TheCall)) return ExprError(); // Since the target specific builtins for each arch overlap, only check those @@ -3441,234 +2953,6 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) { << ArgNum << Arg->getSourceRange(); } -// Helper function for CheckHLSLBuiltinFunctionCall -bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) { - assert(TheCall->getNumArgs() > 1); - ExprResult A = 
TheCall->getArg(0); - - QualType ArgTyA = A.get()->getType(); - - auto *VecTyA = ArgTyA->getAs(); - SourceLocation BuiltinLoc = TheCall->getBeginLoc(); - - for (unsigned i = 1; i < TheCall->getNumArgs(); ++i) { - ExprResult B = TheCall->getArg(i); - QualType ArgTyB = B.get()->getType(); - auto *VecTyB = ArgTyB->getAs(); - if (VecTyA == nullptr && VecTyB == nullptr) - return false; - - if (VecTyA && VecTyB) { - bool retValue = false; - if (VecTyA->getElementType() != VecTyB->getElementType()) { - // Note: type promotion is intended to be handeled via the intrinsics - // and not the builtin itself. - S->Diag(TheCall->getBeginLoc(), - diag::err_vec_builtin_incompatible_vector) - << TheCall->getDirectCallee() << /*useAllTerminology*/ true - << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc()); - retValue = true; - } - if (VecTyA->getNumElements() != VecTyB->getNumElements()) { - // You should only be hitting this case if you are calling the builtin - // directly. HLSL intrinsics should avoid this case via a - // HLSLVectorTruncation. - S->Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector) - << TheCall->getDirectCallee() << /*useAllTerminology*/ true - << SourceRange(TheCall->getArg(0)->getBeginLoc(), - TheCall->getArg(1)->getEndLoc()); - retValue = true; - } - return retValue; - } - } - - // Note: if we get here one of the args is a scalar which - // requires a VectorSplat on Arg0 or Arg1 - S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) - << TheCall->getDirectCallee() << /*useAllTerminology*/ true - << SourceRange(TheCall->getArg(0)->getBeginLoc(), - TheCall->getArg(1)->getEndLoc()); - return true; -} - -bool CheckArgsTypesAreCorrect( - Sema *S, CallExpr *TheCall, QualType ExpectedType, - llvm::function_ref Check) { - for (unsigned i = 0; i < TheCall->getNumArgs(); ++i) { - QualType PassedType = TheCall->getArg(i)->getType(); - if (Check(PassedType)) { - if (auto *VecTyA = PassedType->getAs()) - ExpectedType = S->Context.getVectorType( - ExpectedType, VecTyA->getNumElements(), VecTyA->getVectorKind()); - S->Diag(TheCall->getArg(0)->getBeginLoc(), - diag::err_typecheck_convert_incompatible) - << PassedType << ExpectedType << 1 << 0 << 0; - return true; - } - } - return false; -} - -bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) { - auto checkAllFloatTypes = [](clang::QualType PassedType) -> bool { - return !PassedType->hasFloatingRepresentation(); - }; - return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy, - checkAllFloatTypes); -} - -bool CheckFloatOrHalfRepresentations(Sema *S, CallExpr *TheCall) { - auto checkFloatorHalf = [](clang::QualType PassedType) -> bool { - clang::QualType BaseType = - PassedType->isVectorType() - ? 
PassedType->getAs()->getElementType() - : PassedType; - return !BaseType->isHalfType() && !BaseType->isFloat32Type(); - }; - return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy, - checkFloatorHalf); -} - -bool CheckNoDoubleVectors(Sema *S, CallExpr *TheCall) { - auto checkDoubleVector = [](clang::QualType PassedType) -> bool { - if (const auto *VecTy = PassedType->getAs()) - return VecTy->getElementType()->isDoubleType(); - return false; - }; - return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy, - checkDoubleVector); -} - -bool CheckUnsignedIntRepresentation(Sema *S, CallExpr *TheCall) { - auto checkAllUnsignedTypes = [](clang::QualType PassedType) -> bool { - return !PassedType->hasUnsignedIntegerRepresentation(); - }; - return CheckArgsTypesAreCorrect(S, TheCall, S->Context.UnsignedIntTy, - checkAllUnsignedTypes); -} - -void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall, - QualType ReturnType) { - auto *VecTyA = TheCall->getArg(0)->getType()->getAs(); - if (VecTyA) - ReturnType = S->Context.getVectorType(ReturnType, VecTyA->getNumElements(), - VectorKind::Generic); - TheCall->setType(ReturnType); -} - -// Note: returning true in this case results in CheckBuiltinFunctionCall -// returning an ExprError -bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { - switch (BuiltinID) { - case Builtin::BI__builtin_hlsl_elementwise_all: - case Builtin::BI__builtin_hlsl_elementwise_any: { - if (checkArgCount(TheCall, 1)) - return true; - break; - } - case Builtin::BI__builtin_hlsl_elementwise_clamp: { - if (checkArgCount(TheCall, 3)) - return true; - if (CheckVectorElementCallArgs(this, TheCall)) - return true; - if (BuiltinElementwiseTernaryMath( - TheCall, /*CheckForFloatArgs*/ - TheCall->getArg(0)->getType()->hasFloatingRepresentation())) - return true; - break; - } - case Builtin::BI__builtin_hlsl_dot: { - if (checkArgCount(TheCall, 2)) - return true; - if (CheckVectorElementCallArgs(this, TheCall)) - return true; - if (BuiltinVectorToScalarMath(TheCall)) - return true; - if (CheckNoDoubleVectors(this, TheCall)) - return true; - break; - } - case Builtin::BI__builtin_hlsl_elementwise_rcp: { - if (CheckAllArgsHaveFloatRepresentation(this, TheCall)) - return true; - if (PrepareBuiltinElementwiseMathOneArgCall(TheCall)) - return true; - break; - } - case Builtin::BI__builtin_hlsl_elementwise_rsqrt: - case Builtin::BI__builtin_hlsl_elementwise_frac: { - if (CheckFloatOrHalfRepresentations(this, TheCall)) - return true; - if (PrepareBuiltinElementwiseMathOneArgCall(TheCall)) - return true; - break; - } - case Builtin::BI__builtin_hlsl_elementwise_isinf: { - if (CheckFloatOrHalfRepresentations(this, TheCall)) - return true; - if (PrepareBuiltinElementwiseMathOneArgCall(TheCall)) - return true; - SetElementTypeAsReturnType(this, TheCall, this->Context.BoolTy); - break; - } - case Builtin::BI__builtin_hlsl_lerp: { - if (checkArgCount(TheCall, 3)) - return true; - if (CheckVectorElementCallArgs(this, TheCall)) - return true; - if (BuiltinElementwiseTernaryMath(TheCall)) - return true; - if (CheckFloatOrHalfRepresentations(this, TheCall)) - return true; - break; - } - case Builtin::BI__builtin_hlsl_mad: { - if (checkArgCount(TheCall, 3)) - return true; - if (CheckVectorElementCallArgs(this, TheCall)) - return true; - if (BuiltinElementwiseTernaryMath( - TheCall, /*CheckForFloatArgs*/ - TheCall->getArg(0)->getType()->hasFloatingRepresentation())) - return true; - break; - } - // Note these are llvm builtins that we want to catch invalid intrinsic - 
// generation. Normal handling of these builitns will occur elsewhere. - case Builtin::BI__builtin_elementwise_bitreverse: { - if (CheckUnsignedIntRepresentation(this, TheCall)) - return true; - break; - } - case Builtin::BI__builtin_elementwise_acos: - case Builtin::BI__builtin_elementwise_asin: - case Builtin::BI__builtin_elementwise_atan: - case Builtin::BI__builtin_elementwise_ceil: - case Builtin::BI__builtin_elementwise_cos: - case Builtin::BI__builtin_elementwise_cosh: - case Builtin::BI__builtin_elementwise_exp: - case Builtin::BI__builtin_elementwise_exp2: - case Builtin::BI__builtin_elementwise_floor: - case Builtin::BI__builtin_elementwise_log: - case Builtin::BI__builtin_elementwise_log2: - case Builtin::BI__builtin_elementwise_log10: - case Builtin::BI__builtin_elementwise_pow: - case Builtin::BI__builtin_elementwise_roundeven: - case Builtin::BI__builtin_elementwise_sin: - case Builtin::BI__builtin_elementwise_sinh: - case Builtin::BI__builtin_elementwise_sqrt: - case Builtin::BI__builtin_elementwise_tan: - case Builtin::BI__builtin_elementwise_tanh: - case Builtin::BI__builtin_elementwise_trunc: { - if (CheckFloatOrHalfRepresentations(this, TheCall)) - return true; - break; - } - } - return false; -} - /// Given a FunctionDecl's FormatAttr, attempts to populate the FomatStringInfo /// parameter with the FormatAttr's correct format_idx and firstDataArg. /// Returns true when the format fits the function and the FormatStringInfo has @@ -3740,58 +3024,6 @@ static void CheckNonNullArgument(Sema &S, << ArgExpr->getSourceRange()); } -bool Sema::GetFormatNSStringIdx(const FormatAttr *Format, unsigned &Idx) { - FormatStringInfo FSI; - if ((GetFormatStringType(Format) == FST_NSString) && - getFormatStringInfo(Format, false, true, &FSI)) { - Idx = FSI.FormatIdx; - return true; - } - return false; -} - -/// Diagnose use of %s directive in an NSString which is being passed -/// as formatting string to formatting method. -static void -DiagnoseCStringFormatDirectiveInCFAPI(Sema &S, - const NamedDecl *FDecl, - Expr **Args, - unsigned NumArgs) { - unsigned Idx = 0; - bool Format = false; - ObjCStringFormatFamily SFFamily = FDecl->getObjCFStringFormattingFamily(); - if (SFFamily == ObjCStringFormatFamily::SFF_CFString) { - Idx = 2; - Format = true; - } - else - for (const auto *I : FDecl->specific_attrs()) { - if (S.GetFormatNSStringIdx(I, Idx)) { - Format = true; - break; - } - } - if (!Format || NumArgs <= Idx) - return; - const Expr *FormatExpr = Args[Idx]; - if (const CStyleCastExpr *CSCE = dyn_cast(FormatExpr)) - FormatExpr = CSCE->getSubExpr(); - const StringLiteral *FormatString; - if (const ObjCStringLiteral *OSL = - dyn_cast(FormatExpr->IgnoreParenImpCasts())) - FormatString = OSL->getString(); - else - FormatString = dyn_cast(FormatExpr->IgnoreParenImpCasts()); - if (!FormatString) - return; - if (S.FormatStringHasSArg(FormatString)) { - S.Diag(FormatExpr->getExprLoc(), diag::warn_objc_cdirective_format_string) - << "%s" << 1 << 1; - S.Diag(FDecl->getLocation(), diag::note_entity_declared_at) - << FDecl->getDeclName(); - } -} - /// Determine whether the given type has a non-null nullability annotation. 
static bool isNonNullType(QualType type) { if (auto nullability = type->getNullability()) @@ -4208,7 +3440,7 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, CheckInfNaNFunction(TheCall, FDecl); if (getLangOpts().ObjC) - DiagnoseCStringFormatDirectiveInCFAPI(*this, FDecl, Args, NumArgs); + ObjC().DiagnoseCStringFormatDirectiveInCFAPI(FDecl, Args, NumArgs); unsigned CMId = FDecl->getMemoryFunctionKind(); @@ -4582,6 +3814,9 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange, } // Pointer to object of size zero is not allowed. + if (RequireCompleteType(Ptr->getBeginLoc(), AtomTy, + diag::err_incomplete_type)) + return ExprError(); if (Context.getTypeInfoInChars(AtomTy).Width.isZero()) { Diag(ExprRange.getBegin(), diag::err_atomic_builtin_must_be_pointer) << Ptr->getType() << 1 << Ptr->getSourceRange(); @@ -11274,26 +10509,6 @@ static void DiagnoseImpCast(Sema &S, Expr *E, QualType T, DiagnoseImpCast(S, E, E->getType(), T, CContext, diag, pruneControlFlow); } -static bool isObjCSignedCharBool(Sema &S, QualType Ty) { - return Ty->isSpecificBuiltinType(BuiltinType::SChar) && - S.getLangOpts().ObjC && S.ObjC().NSAPIObj->isObjCBOOLType(Ty); -} - -static void adornObjCBoolConversionDiagWithTernaryFixit( - Sema &S, Expr *SourceExpr, const Sema::SemaDiagnosticBuilder &Builder) { - Expr *Ignored = SourceExpr->IgnoreImplicit(); - if (const auto *OVE = dyn_cast(Ignored)) - Ignored = OVE->getSourceExpr(); - bool NeedsParens = isa(Ignored) || - isa(Ignored) || - isa(Ignored); - SourceLocation EndLoc = S.getLocForEndOfToken(SourceExpr->getEndLoc()); - if (NeedsParens) - Builder << FixItHint::CreateInsertion(SourceExpr->getBeginLoc(), "(") - << FixItHint::CreateInsertion(EndLoc, ")"); - Builder << FixItHint::CreateInsertion(EndLoc, " ? YES : NO"); -} - /// Diagnose an implicit cast from a floating point value to an integer value. 
static void DiagnoseFloatingImpCast(Sema &S, Expr *E, QualType T, SourceLocation CContext) { @@ -11313,11 +10528,10 @@ static void DiagnoseFloatingImpCast(Sema &S, Expr *E, QualType T, bool IsConstant = E->EvaluateAsFloat(Value, S.Context, Expr::SE_AllowSideEffects); if (!IsConstant) { - if (isObjCSignedCharBool(S, T)) { - return adornObjCBoolConversionDiagWithTernaryFixit( - S, E, - S.Diag(CContext, diag::warn_impcast_float_to_objc_signed_char_bool) - << E->getType()); + if (S.ObjC().isSignedCharBool(T)) { + return S.ObjC().adornBoolConversionDiagWithTernaryFixit( + E, S.Diag(CContext, diag::warn_impcast_float_to_objc_signed_char_bool) + << E->getType()); } return DiagnoseImpCast(S, E, T, CContext, @@ -11341,11 +10555,10 @@ static void DiagnoseFloatingImpCast(Sema &S, Expr *E, QualType T, precision = (precision * 59 + 195) / 196; Value.toString(PrettySourceValue, precision); - if (isObjCSignedCharBool(S, T) && IntegerValue != 0 && IntegerValue != 1) { - return adornObjCBoolConversionDiagWithTernaryFixit( - S, E, - S.Diag(CContext, diag::warn_impcast_constant_value_to_objc_bool) - << PrettySourceValue); + if (S.ObjC().isSignedCharBool(T) && IntegerValue != 0 && IntegerValue != 1) { + return S.ObjC().adornBoolConversionDiagWithTernaryFixit( + E, S.Diag(CContext, diag::warn_impcast_constant_value_to_objc_bool) + << PrettySourceValue); } if (Result == llvm::APFloat::opOK && isExact) { @@ -11546,102 +10759,6 @@ static void DiagnoseNullConversion(Sema &S, Expr *E, QualType T, S.getFixItZeroLiteralForType(T, Loc)); } -static void checkObjCArrayLiteral(Sema &S, QualType TargetType, - ObjCArrayLiteral *ArrayLiteral); - -static void -checkObjCDictionaryLiteral(Sema &S, QualType TargetType, - ObjCDictionaryLiteral *DictionaryLiteral); - -/// Check a single element within a collection literal against the -/// target element type. -static void checkObjCCollectionLiteralElement(Sema &S, - QualType TargetElementType, - Expr *Element, - unsigned ElementKind) { - // Skip a bitcast to 'id' or qualified 'id'. - if (auto ICE = dyn_cast(Element)) { - if (ICE->getCastKind() == CK_BitCast && - ICE->getSubExpr()->getType()->getAs()) - Element = ICE->getSubExpr(); - } - - QualType ElementType = Element->getType(); - ExprResult ElementResult(Element); - if (ElementType->getAs() && - S.CheckSingleAssignmentConstraints(TargetElementType, - ElementResult, - false, false) - != Sema::Compatible) { - S.Diag(Element->getBeginLoc(), diag::warn_objc_collection_literal_element) - << ElementType << ElementKind << TargetElementType - << Element->getSourceRange(); - } - - if (auto ArrayLiteral = dyn_cast(Element)) - checkObjCArrayLiteral(S, TargetElementType, ArrayLiteral); - else if (auto DictionaryLiteral = dyn_cast(Element)) - checkObjCDictionaryLiteral(S, TargetElementType, DictionaryLiteral); -} - -/// Check an Objective-C array literal being converted to the given -/// target type. 
-static void checkObjCArrayLiteral(Sema &S, QualType TargetType, - ObjCArrayLiteral *ArrayLiteral) { - if (!S.ObjC().NSArrayDecl) - return; - - const auto *TargetObjCPtr = TargetType->getAs(); - if (!TargetObjCPtr) - return; - - if (TargetObjCPtr->isUnspecialized() || - TargetObjCPtr->getInterfaceDecl()->getCanonicalDecl() != - S.ObjC().NSArrayDecl->getCanonicalDecl()) - return; - - auto TypeArgs = TargetObjCPtr->getTypeArgs(); - if (TypeArgs.size() != 1) - return; - - QualType TargetElementType = TypeArgs[0]; - for (unsigned I = 0, N = ArrayLiteral->getNumElements(); I != N; ++I) { - checkObjCCollectionLiteralElement(S, TargetElementType, - ArrayLiteral->getElement(I), - 0); - } -} - -/// Check an Objective-C dictionary literal being converted to the given -/// target type. -static void -checkObjCDictionaryLiteral(Sema &S, QualType TargetType, - ObjCDictionaryLiteral *DictionaryLiteral) { - if (!S.ObjC().NSDictionaryDecl) - return; - - const auto *TargetObjCPtr = TargetType->getAs(); - if (!TargetObjCPtr) - return; - - if (TargetObjCPtr->isUnspecialized() || - TargetObjCPtr->getInterfaceDecl()->getCanonicalDecl() != - S.ObjC().NSDictionaryDecl->getCanonicalDecl()) - return; - - auto TypeArgs = TargetObjCPtr->getTypeArgs(); - if (TypeArgs.size() != 2) - return; - - QualType TargetKeyType = TypeArgs[0]; - QualType TargetObjectType = TypeArgs[1]; - for (unsigned I = 0, N = DictionaryLiteral->getNumElements(); I != N; ++I) { - auto Element = DictionaryLiteral->getKeyValueElement(I); - checkObjCCollectionLiteralElement(S, TargetKeyType, Element.Key, 1); - checkObjCCollectionLiteralElement(S, TargetObjectType, Element.Value, 2); - } -} - // Helper function to filter out cases for constant width constant conversion. // Don't warn on char array initialization or for non-decimal values. static bool isSameWidthConstantConversion(Sema &S, Expr *E, QualType T, @@ -11717,14 +10834,12 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) { } } -static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, - SourceLocation CC, - bool *ICContext = nullptr, - bool IsListInit = false) { +void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, + bool *ICContext, bool IsListInit) { if (E->isTypeDependent() || E->isValueDependent()) return; - const Type *Source = S.Context.getCanonicalType(E->getType()).getTypePtr(); - const Type *Target = S.Context.getCanonicalType(T).getTypePtr(); + const Type *Source = Context.getCanonicalType(E->getType()).getTypePtr(); + const Type *Target = Context.getCanonicalType(T).getTypePtr(); if (Source == Target) return; if (Target->isDependentType()) return; @@ -11737,7 +10852,7 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, return; if (Source->isAtomicType()) - S.Diag(E->getExprLoc(), diag::warn_atomic_implicit_seq_cst); + Diag(E->getExprLoc(), diag::warn_atomic_implicit_seq_cst); // Diagnose implicit casts to bool. if (Target->isSpecificBuiltinType(BuiltinType::Bool)) { @@ -11745,34 +10860,32 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, // Warn on string literal to bool. Checks for string literals in logical // and expressions, for instance, assert(0 && "error here"), are // prevented by a check in AnalyzeImplicitConversions(). - return DiagnoseImpCast(S, E, T, CC, + return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_string_literal_to_bool); if (isa(E) || isa(E) || isa(E) || isa(E)) { // This covers the literal expressions that evaluate to Objective-C // objects. 
- return DiagnoseImpCast(S, E, T, CC, + return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_objective_c_literal_to_bool); } if (Source->isPointerType() || Source->canDecayToPointerType()) { // Warn on pointer to bool conversion that is always true. - S.DiagnoseAlwaysNonNullPointer(E, Expr::NPCK_NotNull, /*IsEqual*/ false, - SourceRange(CC)); + DiagnoseAlwaysNonNullPointer(E, Expr::NPCK_NotNull, /*IsEqual*/ false, + SourceRange(CC)); } } // If the we're converting a constant to an ObjC BOOL on a platform where BOOL // is a typedef for signed char (macOS), then that constant value has to be 1 // or 0. - if (isObjCSignedCharBool(S, T) && Source->isIntegralType(S.Context)) { + if (ObjC().isSignedCharBool(T) && Source->isIntegralType(Context)) { Expr::EvalResult Result; - if (E->EvaluateAsInt(Result, S.getASTContext(), - Expr::SE_AllowSideEffects)) { + if (E->EvaluateAsInt(Result, getASTContext(), Expr::SE_AllowSideEffects)) { if (Result.Val.getInt() != 1 && Result.Val.getInt() != 0) { - adornObjCBoolConversionDiagWithTernaryFixit( - S, E, - S.Diag(CC, diag::warn_impcast_constant_value_to_objc_bool) - << toString(Result.Val.getInt(), 10)); + ObjC().adornBoolConversionDiagWithTernaryFixit( + E, Diag(CC, diag::warn_impcast_constant_value_to_objc_bool) + << toString(Result.Val.getInt(), 10)); } return; } @@ -11781,42 +10894,43 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, // Check implicit casts from Objective-C collection literals to specialized // collection types, e.g., NSArray *. if (auto *ArrayLiteral = dyn_cast(E)) - checkObjCArrayLiteral(S, QualType(Target, 0), ArrayLiteral); + ObjC().checkArrayLiteral(QualType(Target, 0), ArrayLiteral); else if (auto *DictionaryLiteral = dyn_cast(E)) - checkObjCDictionaryLiteral(S, QualType(Target, 0), DictionaryLiteral); + ObjC().checkDictionaryLiteral(QualType(Target, 0), DictionaryLiteral); // Strip vector types. if (isa(Source)) { if (Target->isSveVLSBuiltinType() && - (S.Context.areCompatibleSveTypes(QualType(Target, 0), - QualType(Source, 0)) || - S.Context.areLaxCompatibleSveTypes(QualType(Target, 0), - QualType(Source, 0)))) + (Context.areCompatibleSveTypes(QualType(Target, 0), + QualType(Source, 0)) || + Context.areLaxCompatibleSveTypes(QualType(Target, 0), + QualType(Source, 0)))) return; if (Target->isRVVVLSBuiltinType() && - (S.Context.areCompatibleRVVTypes(QualType(Target, 0), - QualType(Source, 0)) || - S.Context.areLaxCompatibleRVVTypes(QualType(Target, 0), - QualType(Source, 0)))) + (Context.areCompatibleRVVTypes(QualType(Target, 0), + QualType(Source, 0)) || + Context.areLaxCompatibleRVVTypes(QualType(Target, 0), + QualType(Source, 0)))) return; if (!isa(Target)) { - if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; - return DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_vector_scalar); - } else if (S.getLangOpts().HLSL && + return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_vector_scalar); + } else if (getLangOpts().HLSL && Target->castAs()->getNumElements() < Source->castAs()->getNumElements()) { // Diagnose vector truncation but don't return. We may also want to // diagnose an element conversion. - DiagnoseImpCast(S, E, T, CC, diag::warn_hlsl_impcast_vector_truncation); + DiagnoseImpCast(*this, E, T, CC, + diag::warn_hlsl_impcast_vector_truncation); } // If the vector cast is cast between two vectors of the same size, it is // a bitcast, not a conversion, except under HLSL where it is a conversion. 
- if (!S.getLangOpts().HLSL && - S.Context.getTypeSize(Source) == S.Context.getTypeSize(Target)) + if (!getLangOpts().HLSL && + Context.getTypeSize(Source) == Context.getTypeSize(Target)) return; Source = cast(Source)->getElementType().getTypePtr(); @@ -11828,11 +10942,11 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, // Strip complex types. if (isa(Source)) { if (!isa(Target)) { - if (S.SourceMgr.isInSystemMacro(CC) || Target->isBooleanType()) + if (SourceMgr.isInSystemMacro(CC) || Target->isBooleanType()) return; - return DiagnoseImpCast(S, E, T, CC, - S.getLangOpts().CPlusPlus + return DiagnoseImpCast(*this, E, T, CC, + getLangOpts().CPlusPlus ? diag::err_impcast_complex_scalar : diag::warn_impcast_complex_scalar); } @@ -11847,25 +10961,25 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, // Strip SVE vector types if (SourceBT && SourceBT->isSveVLSBuiltinType()) { // Need the original target type for vector type checks - const Type *OriginalTarget = S.Context.getCanonicalType(T).getTypePtr(); + const Type *OriginalTarget = Context.getCanonicalType(T).getTypePtr(); // Handle conversion from scalable to fixed when msve-vector-bits is // specified - if (S.Context.areCompatibleSveTypes(QualType(OriginalTarget, 0), - QualType(Source, 0)) || - S.Context.areLaxCompatibleSveTypes(QualType(OriginalTarget, 0), - QualType(Source, 0))) + if (Context.areCompatibleSveTypes(QualType(OriginalTarget, 0), + QualType(Source, 0)) || + Context.areLaxCompatibleSveTypes(QualType(OriginalTarget, 0), + QualType(Source, 0))) return; // If the vector cast is cast between two vectors of the same size, it is // a bitcast, not a conversion. - if (S.Context.getTypeSize(Source) == S.Context.getTypeSize(Target)) + if (Context.getTypeSize(Source) == Context.getTypeSize(Target)) return; - Source = SourceBT->getSveEltType(S.Context).getTypePtr(); + Source = SourceBT->getSveEltType(Context).getTypePtr(); } if (TargetBT && TargetBT->isSveVLSBuiltinType()) - Target = TargetBT->getSveEltType(S.Context).getTypePtr(); + Target = TargetBT->getSveEltType(Context).getTypePtr(); // If the source is floating point... if (SourceBT && SourceBT->isFloatingPoint()) { @@ -11873,41 +10987,42 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, if (TargetBT && TargetBT->isFloatingPoint()) { // ...then warn if we're dropping FP rank. - int Order = S.getASTContext().getFloatingTypeSemanticOrder( + int Order = getASTContext().getFloatingTypeSemanticOrder( QualType(SourceBT, 0), QualType(TargetBT, 0)); if (Order > 0) { // Don't warn about float constants that are precisely // representable in the target type. Expr::EvalResult result; - if (E->EvaluateAsRValue(result, S.Context)) { + if (E->EvaluateAsRValue(result, Context)) { // Value might be a float, a float vector, or a float complex. - if (IsSameFloatAfterCast(result.Val, - S.Context.getFloatTypeSemantics(QualType(TargetBT, 0)), - S.Context.getFloatTypeSemantics(QualType(SourceBT, 0)))) + if (IsSameFloatAfterCast( + result.Val, + Context.getFloatTypeSemantics(QualType(TargetBT, 0)), + Context.getFloatTypeSemantics(QualType(SourceBT, 0)))) return; } - if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; - DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_float_precision); + DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_float_precision); } // ... 
or possibly if we're increasing rank, too else if (Order < 0) { - if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; - DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_double_promotion); + DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_double_promotion); } return; } // If the target is integral, always warn. if (TargetBT && TargetBT->isInteger()) { - if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; - DiagnoseFloatingImpCast(S, E, T, CC); + DiagnoseFloatingImpCast(*this, E, T, CC); } // Detect the case where a call result is converted from floating-point to @@ -11929,7 +11044,7 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, if (isa(LastA) && InnerE->getType()->isBooleanType()) { // Warn on this floating-point to bool conversion - DiagnoseImpCast(S, E, T, CC, + DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_floating_point_to_bool); } } @@ -11941,38 +11056,37 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, if (Source->isFixedPointType()) { if (Target->isUnsaturatedFixedPointType()) { Expr::EvalResult Result; - if (E->EvaluateAsFixedPoint(Result, S.Context, Expr::SE_AllowSideEffects, - S.isConstantEvaluatedContext())) { + if (E->EvaluateAsFixedPoint(Result, Context, Expr::SE_AllowSideEffects, + isConstantEvaluatedContext())) { llvm::APFixedPoint Value = Result.Val.getFixedPoint(); - llvm::APFixedPoint MaxVal = S.Context.getFixedPointMax(T); - llvm::APFixedPoint MinVal = S.Context.getFixedPointMin(T); + llvm::APFixedPoint MaxVal = Context.getFixedPointMax(T); + llvm::APFixedPoint MinVal = Context.getFixedPointMin(T); if (Value > MaxVal || Value < MinVal) { - S.DiagRuntimeBehavior(E->getExprLoc(), E, - S.PDiag(diag::warn_impcast_fixed_point_range) - << Value.toString() << T - << E->getSourceRange() - << clang::SourceRange(CC)); + DiagRuntimeBehavior(E->getExprLoc(), E, + PDiag(diag::warn_impcast_fixed_point_range) + << Value.toString() << T + << E->getSourceRange() + << clang::SourceRange(CC)); return; } } } else if (Target->isIntegerType()) { Expr::EvalResult Result; - if (!S.isConstantEvaluatedContext() && - E->EvaluateAsFixedPoint(Result, S.Context, - Expr::SE_AllowSideEffects)) { + if (!isConstantEvaluatedContext() && + E->EvaluateAsFixedPoint(Result, Context, Expr::SE_AllowSideEffects)) { llvm::APFixedPoint FXResult = Result.Val.getFixedPoint(); bool Overflowed; llvm::APSInt IntResult = FXResult.convertToInt( - S.Context.getIntWidth(T), - Target->isSignedIntegerOrEnumerationType(), &Overflowed); + Context.getIntWidth(T), Target->isSignedIntegerOrEnumerationType(), + &Overflowed); if (Overflowed) { - S.DiagRuntimeBehavior(E->getExprLoc(), E, - S.PDiag(diag::warn_impcast_fixed_point_range) - << FXResult.toString() << T - << E->getSourceRange() - << clang::SourceRange(CC)); + DiagRuntimeBehavior(E->getExprLoc(), E, + PDiag(diag::warn_impcast_fixed_point_range) + << FXResult.toString() << T + << E->getSourceRange() + << clang::SourceRange(CC)); return; } } @@ -11980,20 +11094,20 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, } else if (Target->isUnsaturatedFixedPointType()) { if (Source->isIntegerType()) { Expr::EvalResult Result; - if (!S.isConstantEvaluatedContext() && - E->EvaluateAsInt(Result, S.Context, Expr::SE_AllowSideEffects)) { + if (!isConstantEvaluatedContext() && + E->EvaluateAsInt(Result, Context, Expr::SE_AllowSideEffects)) { llvm::APSInt Value = Result.Val.getInt(); bool Overflowed; llvm::APFixedPoint IntResult = 
llvm::APFixedPoint::getFromIntValue( - Value, S.Context.getFixedPointSemantics(T), &Overflowed); + Value, Context.getFixedPointSemantics(T), &Overflowed); if (Overflowed) { - S.DiagRuntimeBehavior(E->getExprLoc(), E, - S.PDiag(diag::warn_impcast_fixed_point_range) - << toString(Value, /*Radix=*/10) << T - << E->getSourceRange() - << clang::SourceRange(CC)); + DiagRuntimeBehavior(E->getExprLoc(), E, + PDiag(diag::warn_impcast_fixed_point_range) + << toString(Value, /*Radix=*/10) << T + << E->getSourceRange() + << clang::SourceRange(CC)); return; } } @@ -12007,25 +11121,25 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, TargetBT->isFloatingType() && !IsListInit) { // Determine the number of precision bits in the source integer type. IntRange SourceRange = - GetExprRange(S.Context, E, S.isConstantEvaluatedContext(), + GetExprRange(Context, E, isConstantEvaluatedContext(), /*Approximate=*/true); unsigned int SourcePrecision = SourceRange.Width; // Determine the number of precision bits in the // target floating point type. unsigned int TargetPrecision = llvm::APFloatBase::semanticsPrecision( - S.Context.getFloatTypeSemantics(QualType(TargetBT, 0))); + Context.getFloatTypeSemantics(QualType(TargetBT, 0))); if (SourcePrecision > 0 && TargetPrecision > 0 && SourcePrecision > TargetPrecision) { if (std::optional SourceInt = - E->getIntegerConstantExpr(S.Context)) { + E->getIntegerConstantExpr(Context)) { // If the source integer is a constant, convert it to the target // floating point type. Issue a warning if the value changes // during the whole conversion. llvm::APFloat TargetFloatValue( - S.Context.getFloatTypeSemantics(QualType(TargetBT, 0))); + Context.getFloatTypeSemantics(QualType(TargetBT, 0))); llvm::APFloat::opStatus ConversionStatus = TargetFloatValue.convertFromAPInt( *SourceInt, SourceBT->isSignedInteger(), @@ -12037,26 +11151,26 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, SmallString<32> PrettyTargetValue; TargetFloatValue.toString(PrettyTargetValue, TargetPrecision); - S.DiagRuntimeBehavior( + DiagRuntimeBehavior( E->getExprLoc(), E, - S.PDiag(diag::warn_impcast_integer_float_precision_constant) + PDiag(diag::warn_impcast_integer_float_precision_constant) << PrettySourceValue << PrettyTargetValue << E->getType() << T << E->getSourceRange() << clang::SourceRange(CC)); } } else { // Otherwise, the implicit conversion may lose precision. 
- DiagnoseImpCast(S, E, T, CC, + DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_integer_float_precision); } } } - DiagnoseNullConversion(S, E, T, CC); + DiagnoseNullConversion(*this, E, T, CC); - S.DiscardMisalignedMemberAddress(Target, E); + DiscardMisalignedMemberAddress(Target, E); if (Target->isBooleanType()) - DiagnoseIntInBoolContext(S, E); + DiagnoseIntInBoolContext(*this, E); if (!Source->isIntegerType() || !Target->isIntegerType()) return; @@ -12066,51 +11180,51 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, if (Target->isSpecificBuiltinType(BuiltinType::Bool)) return; - if (isObjCSignedCharBool(S, T) && !Source->isCharType() && + if (ObjC().isSignedCharBool(T) && !Source->isCharType() && !E->isKnownToHaveBooleanValue(/*Semantic=*/false)) { - return adornObjCBoolConversionDiagWithTernaryFixit( - S, E, - S.Diag(CC, diag::warn_impcast_int_to_objc_signed_char_bool) - << E->getType()); + return ObjC().adornBoolConversionDiagWithTernaryFixit( + E, Diag(CC, diag::warn_impcast_int_to_objc_signed_char_bool) + << E->getType()); } IntRange SourceTypeRange = - IntRange::forTargetOfCanonicalType(S.Context, Source); + IntRange::forTargetOfCanonicalType(Context, Source); IntRange LikelySourceRange = GetExprRange( - S.Context, E, S.isConstantEvaluatedContext(), /*Approximate=*/true); - IntRange TargetRange = IntRange::forTargetOfCanonicalType(S.Context, Target); + Context, E, isConstantEvaluatedContext(), /*Approximate=*/true); + IntRange TargetRange = IntRange::forTargetOfCanonicalType(Context, Target); if (LikelySourceRange.Width > TargetRange.Width) { // If the source is a constant, use a default-on diagnostic. // TODO: this should happen for bitfield stores, too. Expr::EvalResult Result; - if (E->EvaluateAsInt(Result, S.Context, Expr::SE_AllowSideEffects, - S.isConstantEvaluatedContext())) { + if (E->EvaluateAsInt(Result, Context, Expr::SE_AllowSideEffects, + isConstantEvaluatedContext())) { llvm::APSInt Value(32); Value = Result.Val.getInt(); - if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; std::string PrettySourceValue = toString(Value, 10); std::string PrettyTargetValue = PrettyPrintInRange(Value, TargetRange); - S.DiagRuntimeBehavior( - E->getExprLoc(), E, - S.PDiag(diag::warn_impcast_integer_precision_constant) - << PrettySourceValue << PrettyTargetValue << E->getType() << T - << E->getSourceRange() << SourceRange(CC)); + DiagRuntimeBehavior(E->getExprLoc(), E, + PDiag(diag::warn_impcast_integer_precision_constant) + << PrettySourceValue << PrettyTargetValue + << E->getType() << T << E->getSourceRange() + << SourceRange(CC)); return; } // People want to build with -Wshorten-64-to-32 and not -Wconversion. 
- if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; - if (TargetRange.Width == 32 && S.Context.getIntWidth(E->getType()) == 64) - return DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_integer_64_32, + if (TargetRange.Width == 32 && Context.getIntWidth(E->getType()) == 64) + return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_integer_64_32, /* pruneControlFlow */ true); - return DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_integer_precision); + return DiagnoseImpCast(*this, E, T, CC, + diag::warn_impcast_integer_precision); } if (TargetRange.Width > SourceTypeRange.Width) { @@ -12118,10 +11232,10 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, if (UO->getOpcode() == UO_Minus) if (Source->isUnsignedIntegerType()) { if (Target->isUnsignedIntegerType()) - return DiagnoseImpCast(S, E, T, CC, + return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_high_order_zero_bits); if (Target->isSignedIntegerType()) - return DiagnoseImpCast(S, E, T, CC, + return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_nonnegative_result); } } @@ -12134,17 +11248,17 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, // cause a negative value to be stored. Expr::EvalResult Result; - if (E->EvaluateAsInt(Result, S.Context, Expr::SE_AllowSideEffects) && - !S.SourceMgr.isInSystemMacro(CC)) { + if (E->EvaluateAsInt(Result, Context, Expr::SE_AllowSideEffects) && + !SourceMgr.isInSystemMacro(CC)) { llvm::APSInt Value = Result.Val.getInt(); - if (isSameWidthConstantConversion(S, E, T, CC)) { + if (isSameWidthConstantConversion(*this, E, T, CC)) { std::string PrettySourceValue = toString(Value, 10); std::string PrettyTargetValue = PrettyPrintInRange(Value, TargetRange); - S.Diag(E->getExprLoc(), - S.PDiag(diag::warn_impcast_integer_precision_constant) - << PrettySourceValue << PrettyTargetValue << E->getType() - << T << E->getSourceRange() << SourceRange(CC)); + Diag(E->getExprLoc(), + PDiag(diag::warn_impcast_integer_precision_constant) + << PrettySourceValue << PrettyTargetValue << E->getType() << T + << E->getSourceRange() << SourceRange(CC)); return; } } @@ -12156,7 +11270,7 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, ((TargetRange.NonNegative && !LikelySourceRange.NonNegative) || (!TargetRange.NonNegative && LikelySourceRange.NonNegative && LikelySourceRange.Width == TargetRange.Width))) { - if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; if (SourceBT && SourceBT->isInteger() && TargetBT && @@ -12177,24 +11291,24 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, *ICContext = true; } - return DiagnoseImpCast(S, E, T, CC, DiagID); + return DiagnoseImpCast(*this, E, T, CC, DiagID); } // Diagnose conversions between different enumeration types. // In C, we pretend that the type of an EnumConstantDecl is its enumeration // type, to give us better diagnostics. 
- QualType SourceType = E->getEnumCoercedType(S.Context); - Source = S.Context.getCanonicalType(SourceType).getTypePtr(); + QualType SourceType = E->getEnumCoercedType(Context); + Source = Context.getCanonicalType(SourceType).getTypePtr(); if (const EnumType *SourceEnum = Source->getAs()) if (const EnumType *TargetEnum = Target->getAs()) if (SourceEnum->getDecl()->hasNameForLinkage() && TargetEnum->getDecl()->hasNameForLinkage() && SourceEnum != TargetEnum) { - if (S.SourceMgr.isInSystemMacro(CC)) + if (SourceMgr.isInSystemMacro(CC)) return; - return DiagnoseImpCast(S, E, SourceType, T, CC, + return DiagnoseImpCast(*this, E, SourceType, T, CC, diag::warn_impcast_different_enum_types); } } @@ -12214,7 +11328,7 @@ static void CheckConditionalOperand(Sema &S, Expr *E, QualType T, AnalyzeImplicitConversions(S, E, CC); if (E->getType() != T) - return CheckImplicitConversion(S, E, T, CC, &ICContext); + return S.CheckImplicitConversion(E, T, CC, &ICContext); } static void CheckConditionalOperator(Sema &S, AbstractConditionalOperator *E, @@ -12245,11 +11359,11 @@ static void CheckConditionalOperator(Sema &S, AbstractConditionalOperator *E, if (E->getType() == T) return; Suspicious = false; - CheckImplicitConversion(S, TrueExpr->IgnoreParenImpCasts(), - E->getType(), CC, &Suspicious); + S.CheckImplicitConversion(TrueExpr->IgnoreParenImpCasts(), E->getType(), CC, + &Suspicious); if (!Suspicious) - CheckImplicitConversion(S, E->getFalseExpr()->IgnoreParenImpCasts(), - E->getType(), CC, &Suspicious); + S.CheckImplicitConversion(E->getFalseExpr()->IgnoreParenImpCasts(), + E->getType(), CC, &Suspicious); } /// Check conversion of given expression to boolean. @@ -12262,7 +11376,7 @@ static void CheckBoolLikeConversion(Sema &S, Expr *E, SourceLocation CC) { return; if (E->IgnoreParenImpCasts()->getType()->isAtomicType()) return; - CheckImplicitConversion(S, E->IgnoreParenImpCasts(), S.Context.BoolTy, CC); + S.CheckImplicitConversion(E->IgnoreParenImpCasts(), S.Context.BoolTy, CC); } namespace { @@ -12352,7 +11466,7 @@ static void AnalyzeImplicitConversions( // The non-canonical typecheck is just an optimization; // CheckImplicitConversion will filter out dead implicit conversions. if (SourceExpr->getType() != T) - CheckImplicitConversion(S, SourceExpr, T, CC, nullptr, IsListInit); + S.CheckImplicitConversion(SourceExpr, T, CC, nullptr, IsListInit); // Now continue drilling into this expression. @@ -12452,21 +11566,6 @@ static void AnalyzeImplicitConversions(Sema &S, Expr *OrigE, SourceLocation CC, AnalyzeImplicitConversions(S, WorkList.pop_back_val(), WorkList); } -/// Diagnose integer type and any valid implicit conversion to it. -static bool checkOpenCLEnqueueIntType(Sema &S, Expr *E, const QualType &IntT) { - // Taking into account implicit conversions, - // allow any integer. - if (!E->getType()->isIntegerType()) { - S.Diag(E->getBeginLoc(), - diag::err_opencl_enqueue_kernel_invalid_local_size_type); - return true; - } - // Potentially emit standard warnings for implicit conversions if enabled - // using -Wconversion. - CheckImplicitConversion(S, E, IntT, E->getBeginLoc()); - return false; -} - // Helper function for Sema::DiagnoseAlwaysNonNullPointer. // Returns true when emitting a warning about taking the address of a reference. 
static bool CheckForReference(Sema &SemaRef, const Expr *E, diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index e28e5c56c11a7b..029ccf944c513f 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -1663,8 +1663,7 @@ bool Sema::CheckRedeclarationModuleOwnership(NamedDecl *New, NamedDecl *Old) { // Partitions are part of the module, but a partition could import another // module, so verify that the PMIs agree. if ((NewM->isModulePartition() || OldM->isModulePartition()) && - NewM->getPrimaryModuleInterfaceName() == - OldM->getPrimaryModuleInterfaceName()) + getASTContext().isInSameModule(NewM, OldM)) return false; } @@ -3912,6 +3911,49 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S, return true; } + const auto OldFX = Old->getFunctionEffects(); + const auto NewFX = New->getFunctionEffects(); + QualType OldQTypeForComparison = OldQType; + if (OldFX != NewFX) { + const auto Diffs = FunctionEffectDifferences(OldFX, NewFX); + for (const auto &Diff : Diffs) { + if (Diff.shouldDiagnoseRedeclaration(*Old, OldFX, *New, NewFX)) { + Diag(New->getLocation(), + diag::warn_mismatched_func_effect_redeclaration) + << Diff.effectName(); + Diag(Old->getLocation(), diag::note_previous_declaration); + } + } + // Following a warning, we could skip merging effects from the previous + // declaration, but that would trigger an additional "conflicting types" + // error. + if (const auto *NewFPT = NewQType->getAs()) { + FunctionEffectSet::Conflicts MergeErrs; + FunctionEffectSet MergedFX = + FunctionEffectSet::getUnion(OldFX, NewFX, MergeErrs); + if (!MergeErrs.empty()) + diagnoseFunctionEffectMergeConflicts(MergeErrs, New->getLocation(), + Old->getLocation()); + + FunctionProtoType::ExtProtoInfo EPI = NewFPT->getExtProtoInfo(); + EPI.FunctionEffects = FunctionEffectsRef(MergedFX); + QualType ModQT = Context.getFunctionType(NewFPT->getReturnType(), + NewFPT->getParamTypes(), EPI); + + New->setType(ModQT); + NewQType = New->getType(); + + // Revise OldQTForComparison to include the merged effects, + // so as not to fail due to differences later. + if (const auto *OldFPT = OldQType->getAs()) { + EPI = OldFPT->getExtProtoInfo(); + EPI.FunctionEffects = FunctionEffectsRef(MergedFX); + OldQTypeForComparison = Context.getFunctionType( + OldFPT->getReturnType(), OldFPT->getParamTypes(), EPI); + } + } + } + if (getLangOpts().CPlusPlus) { OldQType = Context.getCanonicalType(Old->getType()); NewQType = Context.getCanonicalType(New->getType()); @@ -4076,9 +4118,8 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S, // We also want to respect all the extended bits except noreturn. // noreturn should now match unless the old type info didn't have it. - QualType OldQTypeForComparison = OldQType; if (!OldTypeInfo.getNoReturn() && NewTypeInfo.getNoReturn()) { - auto *OldType = OldQType->castAs(); + auto *OldType = OldQTypeForComparison->castAs(); const FunctionType *OldTypeForComparison = Context.adjustFunctionType(OldType, OldTypeInfo.withNoReturn(true)); OldQTypeForComparison = QualType(OldTypeForComparison, 0); @@ -20547,3 +20588,62 @@ bool Sema::shouldIgnoreInHostDeviceCheck(FunctionDecl *Callee) { return LangOpts.CUDA && !LangOpts.CUDAIsDevice && CUDA().IdentifyTarget(Callee) == CUDAFunctionTarget::Global; } + +// Report a failure to merge function effects between declarations due to a +// conflict. 
+void Sema::diagnoseFunctionEffectMergeConflicts( + const FunctionEffectSet::Conflicts &Errs, SourceLocation NewLoc, + SourceLocation OldLoc) { + for (const FunctionEffectSet::Conflict &Conflict : Errs) { + Diag(NewLoc, diag::warn_conflicting_func_effects) + << Conflict.Kept.description() << Conflict.Rejected.description(); + Diag(OldLoc, diag::note_previous_declaration); + } +} + +// Warn and return true if adding an effect to a set would create a conflict. +bool Sema::diagnoseConflictingFunctionEffect( + const FunctionEffectsRef &FX, const FunctionEffectWithCondition &NewEC, + SourceLocation NewAttrLoc) { + // If the new effect has a condition, we can't detect conflicts until the + // condition is resolved. + if (NewEC.Cond.getCondition() != nullptr) + return false; + + // Diagnose the new attribute as incompatible with a previous one. + auto Incompatible = [&](const FunctionEffectWithCondition &PrevEC) { + Diag(NewAttrLoc, diag::err_attributes_are_not_compatible) + << ("'" + NewEC.description() + "'") + << ("'" + PrevEC.description() + "'") << false; + // We don't necessarily have the location of the previous attribute, + // so no note. + return true; + }; + + // Compare against previous attributes. + FunctionEffect::Kind NewKind = NewEC.Effect.kind(); + + for (const FunctionEffectWithCondition &PrevEC : FX) { + // Again, can't check yet when the effect is conditional. + if (PrevEC.Cond.getCondition() != nullptr) + continue; + + FunctionEffect::Kind PrevKind = PrevEC.Effect.kind(); + // Note that we allow PrevKind == NewKind; it's redundant and ignored. + + if (PrevEC.Effect.oppositeKind() == NewKind) + return Incompatible(PrevEC); + + // A new allocating is incompatible with a previous nonblocking. + if (PrevKind == FunctionEffect::Kind::NonBlocking && + NewKind == FunctionEffect::Kind::Allocating) + return Incompatible(PrevEC); + + // A new nonblocking is incompatible with a previous allocating. + if (PrevKind == FunctionEffect::Kind::Allocating && + NewKind == FunctionEffect::Kind::NonBlocking) + return Incompatible(PrevEC); + } + + return false; +} diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index ce6b5b1ff6f931..41489789919d03 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -14,6 +14,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/ASTMutationListener.h" #include "clang/AST/CXXInheritance.h" +#include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" @@ -25,6 +26,7 @@ #include "clang/Basic/CharInfo.h" #include "clang/Basic/Cuda.h" #include "clang/Basic/DarwinSDKInfo.h" +#include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/HLSLRuntime.h" #include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LangOptions.h" @@ -65,10 +67,13 @@ #include "llvm/Demangle/Demangle.h" #include "llvm/IR/Assumptions.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" +#include #include using namespace clang; @@ -166,13 +171,6 @@ bool Sema::checkStringLiteralArgumentAttr(const ParsedAttr &AL, unsigned ArgNum, return checkStringLiteralArgumentAttr(AL, ArgExpr, Str, ArgLocation); } -/// Check if the passed-in expression is of type int or bool. 
-static bool isIntOrBool(Expr *Exp) { - QualType QT = Exp->getType(); - return QT->isBooleanType() || QT->isIntegerType(); -} - - // Check to see if the type is a smart pointer of some kind. We assume // it's a smart pointer if it defines both operator-> and operator*. static bool threadSafetyCheckIsSmartPointer(Sema &S, const RecordType* RT) { @@ -608,15 +606,31 @@ static bool checkTryLockFunAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, if (!AL.checkAtLeastNumArgs(S, 1)) return false; - if (!isIntOrBool(AL.getArgAsExpr(0))) { + // The attribute's first argument defines the success value. + const Expr *SuccessArg = AL.getArgAsExpr(0); + if (!isa(SuccessArg) && + !isa(SuccessArg) && !isa(SuccessArg) && + !isa(SuccessArg) && !SuccessArg->getEnumConstantDecl()) { S.Diag(AL.getLoc(), diag::err_attribute_argument_n_type) - << AL << 1 << AANT_ArgumentIntOrBool; + << AL << 1 << AANT_ArgumentNullptrOrBoolIntOrEnumLiteral; return false; } - // check that all arguments are lockable objects + // All remaining arguments must be lockable objects. checkAttrArgsAreCapabilityObjs(S, D, AL, Args, 1); + // The function must return a pointer, boolean, integer, or enum. We already + // know that `D` is a function because `ExclusiveTrylockFunction` and friends + // are defined in Attr.td with subject lists that only include functions. + QualType ReturnType = D->getAsFunction()->getReturnType(); + if (!ReturnType->isPointerType() && !ReturnType->isBooleanType() && + !ReturnType->isIntegerType() && !ReturnType->isEnumeralType()) { + S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() + << ExpectedFunctionReturningPointerBoolIntOrEnum; + return false; + } + return true; } @@ -5102,12 +5116,12 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { return false; } -// Helper to get CudaArch. -static CudaArch getCudaArch(const TargetInfo &TI) { +// Helper to get OffloadArch. +static OffloadArch getOffloadArch(const TargetInfo &TI) { if (!TI.getTriple().isNVPTX()) - llvm_unreachable("getCudaArch is only valid for NVPTX triple"); + llvm_unreachable("getOffloadArch is only valid for NVPTX triple"); auto &TO = TI.getTargetOpts(); - return StringToCudaArch(TO.CPU); + return StringToOffloadArch(TO.CPU); } // Checks whether an argument of launch_bounds attribute is @@ -5167,10 +5181,10 @@ Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, if (MaxBlocks) { // '.maxclusterrank' ptx directive requires .target sm_90 or higher. 
- auto SM = getCudaArch(Context.getTargetInfo()); - if (SM == CudaArch::UNKNOWN || SM < CudaArch::SM_90) { + auto SM = getOffloadArch(Context.getTargetInfo()); + if (SM == OffloadArch::UNKNOWN || SM < OffloadArch::SM_90) { Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) - << CudaArchToString(SM) << CI << MaxBlocks->getSourceRange(); + << OffloadArchToString(SM) << CI << MaxBlocks->getSourceRange(); // Ignore it by setting MaxBlocks to null; MaxBlocks = nullptr; } else { @@ -6267,6 +6281,116 @@ EnforceTCBLeafAttr *Sema::mergeEnforceTCBLeafAttr( *this, D, AL); } +static void handleVTablePointerAuthentication(Sema &S, Decl *D, + const ParsedAttr &AL) { + CXXRecordDecl *Decl = cast(D); + const uint32_t NumArgs = AL.getNumArgs(); + if (NumArgs > 4) { + S.Diag(AL.getLoc(), diag::err_attribute_too_many_arguments) << AL << 4; + AL.setInvalid(); + } + + if (NumArgs == 0) { + S.Diag(AL.getLoc(), diag::err_attribute_too_few_arguments) << AL; + AL.setInvalid(); + return; + } + + if (D->getAttr()) { + S.Diag(AL.getLoc(), diag::err_duplicated_vtable_pointer_auth) << Decl; + AL.setInvalid(); + } + + auto KeyType = VTablePointerAuthenticationAttr::VPtrAuthKeyType::DefaultKey; + if (AL.isArgIdent(0)) { + IdentifierLoc *IL = AL.getArgAsIdent(0); + if (!VTablePointerAuthenticationAttr::ConvertStrToVPtrAuthKeyType( + IL->Ident->getName(), KeyType)) { + S.Diag(IL->Loc, diag::err_invalid_authentication_key) << IL->Ident; + AL.setInvalid(); + } + if (KeyType == VTablePointerAuthenticationAttr::DefaultKey && + !S.getLangOpts().PointerAuthCalls) { + S.Diag(AL.getLoc(), diag::err_no_default_vtable_pointer_auth) << 0; + AL.setInvalid(); + } + } else { + S.Diag(AL.getLoc(), diag::err_attribute_argument_type) + << AL << AANT_ArgumentIdentifier; + return; + } + + auto AddressDiversityMode = VTablePointerAuthenticationAttr:: + AddressDiscriminationMode::DefaultAddressDiscrimination; + if (AL.getNumArgs() > 1) { + if (AL.isArgIdent(1)) { + IdentifierLoc *IL = AL.getArgAsIdent(1); + if (!VTablePointerAuthenticationAttr:: + ConvertStrToAddressDiscriminationMode(IL->Ident->getName(), + AddressDiversityMode)) { + S.Diag(IL->Loc, diag::err_invalid_address_discrimination) << IL->Ident; + AL.setInvalid(); + } + if (AddressDiversityMode == + VTablePointerAuthenticationAttr::DefaultAddressDiscrimination && + !S.getLangOpts().PointerAuthCalls) { + S.Diag(IL->Loc, diag::err_no_default_vtable_pointer_auth) << 1; + AL.setInvalid(); + } + } else { + S.Diag(AL.getLoc(), diag::err_attribute_argument_type) + << AL << AANT_ArgumentIdentifier; + } + } + + auto ED = VTablePointerAuthenticationAttr::ExtraDiscrimination:: + DefaultExtraDiscrimination; + if (AL.getNumArgs() > 2) { + if (AL.isArgIdent(2)) { + IdentifierLoc *IL = AL.getArgAsIdent(2); + if (!VTablePointerAuthenticationAttr::ConvertStrToExtraDiscrimination( + IL->Ident->getName(), ED)) { + S.Diag(IL->Loc, diag::err_invalid_extra_discrimination) << IL->Ident; + AL.setInvalid(); + } + if (ED == VTablePointerAuthenticationAttr::DefaultExtraDiscrimination && + !S.getLangOpts().PointerAuthCalls) { + S.Diag(AL.getLoc(), diag::err_no_default_vtable_pointer_auth) << 2; + AL.setInvalid(); + } + } else { + S.Diag(AL.getLoc(), diag::err_attribute_argument_type) + << AL << AANT_ArgumentIdentifier; + } + } + + uint32_t CustomDiscriminationValue = 0; + if (ED == VTablePointerAuthenticationAttr::CustomDiscrimination) { + if (NumArgs < 4) { + S.Diag(AL.getLoc(), diag::err_missing_custom_discrimination) << AL << 4; + AL.setInvalid(); + return; + } + if (NumArgs > 4) { + 
S.Diag(AL.getLoc(), diag::err_attribute_too_many_arguments) << AL << 4; + AL.setInvalid(); + } + + if (!AL.isArgExpr(3) || !S.checkUInt32Argument(AL, AL.getArgAsExpr(3), + CustomDiscriminationValue)) { + S.Diag(AL.getLoc(), diag::err_invalid_custom_discrimination); + AL.setInvalid(); + } + } else if (NumArgs > 3) { + S.Diag(AL.getLoc(), diag::err_attribute_too_many_arguments) << AL << 3; + AL.setInvalid(); + } + + Decl->addAttr(::new (S.Context) VTablePointerAuthenticationAttr( + S.Context, AL, KeyType, AddressDiversityMode, ED, + CustomDiscriminationValue)); +} + //===----------------------------------------------------------------------===// // Top Level Sema Entry Points //===----------------------------------------------------------------------===// @@ -7136,6 +7260,10 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_TypeNullable: handleNullableTypeAttr(S, D, AL); break; + + case ParsedAttr::AT_VTablePointerAuthentication: + handleVTablePointerAuthentication(S, D, AL); + break; } } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 234e89b91da07a..ffa6bcac44cbde 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -7116,6 +7116,10 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) { return false; }; + if (!Record->isInvalidDecl() && + Record->hasAttr()) + checkIncorrectVTablePointerAuthenticationAttribute(*Record); + auto CompleteMemberFunction = [&](CXXMethodDecl *M) { // Check whether the explicitly-defaulted members are valid. bool Incomplete = CheckForDefaultedFunction(M); @@ -10500,6 +10504,39 @@ void Sema::checkIllFormedTrivialABIStruct(CXXRecordDecl &RD) { } } +void Sema::checkIncorrectVTablePointerAuthenticationAttribute( + CXXRecordDecl &RD) { + if (RequireCompleteType(RD.getLocation(), Context.getRecordType(&RD), + diag::err_incomplete_type_vtable_pointer_auth)) + return; + + const CXXRecordDecl *PrimaryBase = &RD; + if (PrimaryBase->hasAnyDependentBases()) + return; + + while (1) { + assert(PrimaryBase); + const CXXRecordDecl *Base = nullptr; + for (auto BasePtr : PrimaryBase->bases()) { + if (!BasePtr.getType()->getAsCXXRecordDecl()->isDynamicClass()) + continue; + Base = BasePtr.getType()->getAsCXXRecordDecl(); + break; + } + if (!Base || Base == PrimaryBase || !Base->isPolymorphic()) + break; + Diag(RD.getAttr()->getLocation(), + diag::err_non_top_level_vtable_pointer_auth) + << &RD << Base; + PrimaryBase = Base; + } + + if (!RD.isPolymorphic()) + Diag(RD.getAttr()->getLocation(), + diag::err_non_polymorphic_vtable_pointer_auth) + << &RD; +} + void Sema::ActOnFinishCXXMemberSpecification( Scope *S, SourceLocation RLoc, Decl *TagDecl, SourceLocation LBrac, SourceLocation RBrac, const ParsedAttributesView &AttrList) { @@ -18291,7 +18328,7 @@ void Sema::SetFunctionBodyKind(Decl *D, SourceLocation Loc, FnBodyKind BodyKind, } } -bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New, +bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, const CXXMethodDecl *Old) { const auto *NewFT = New->getType()->castAs(); const auto *OldFT = Old->getType()->castAs(); @@ -18327,6 +18364,41 @@ bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New, return true; } + // Virtual overrides: check for matching effects. 
+ const auto OldFX = Old->getFunctionEffects(); + const auto NewFXOrig = New->getFunctionEffects(); + + if (OldFX != NewFXOrig) { + FunctionEffectSet NewFX(NewFXOrig); + const auto Diffs = FunctionEffectDifferences(OldFX, NewFX); + FunctionEffectSet::Conflicts Errs; + for (const auto &Diff : Diffs) { + switch (Diff.shouldDiagnoseMethodOverride(*Old, OldFX, *New, NewFX)) { + case FunctionEffectDiff::OverrideResult::NoAction: + break; + case FunctionEffectDiff::OverrideResult::Warn: + Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + << Diff.effectName(); + Diag(Old->getLocation(), diag::note_overridden_virtual_function) + << Old->getReturnTypeSourceRange(); + break; + case FunctionEffectDiff::OverrideResult::Merge: { + NewFX.insert(Diff.Old, Errs); + const auto *NewFT = New->getType()->castAs(); + FunctionProtoType::ExtProtoInfo EPI = NewFT->getExtProtoInfo(); + EPI.FunctionEffects = FunctionEffectsRef(NewFX); + QualType ModQT = Context.getFunctionType(NewFT->getReturnType(), + NewFT->getParamTypes(), EPI); + New->setType(ModQT); + break; + } + } + } + if (!Errs.empty()) + diagnoseFunctionEffectMergeConflicts(Errs, New->getLocation(), + Old->getLocation()); + } + CallingConv NewCC = NewFT->getCallConv(), OldCC = OldFT->getCallConv(); // If the calling conventions match, everything is fine diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 4a2f3a65eac96f..db44cfe1288b69 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -14217,6 +14217,39 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) { QualType MPTy = Context.getMemberPointerType( op->getType(), Context.getTypeDeclType(MD->getParent()).getTypePtr()); + + if (getLangOpts().PointerAuthCalls && MD->isVirtual() && + !isUnevaluatedContext() && !MPTy->isDependentType()) { + // When pointer authentication is enabled, argument and return types of + // virtual member functions must be complete. This is because virtual + // member function pointers are implemented using virtual dispatch + // thunks and the thunks cannot be emitted if the argument or return + // types are incomplete. + auto ReturnOrParamTypeIsIncomplete = [&](QualType T, + SourceLocation DeclRefLoc, + SourceLocation RetArgTypeLoc) { + if (RequireCompleteType(DeclRefLoc, T, diag::err_incomplete_type)) { + Diag(DeclRefLoc, + diag::note_ptrauth_virtual_function_pointer_incomplete_arg_ret); + Diag(RetArgTypeLoc, + diag::note_ptrauth_virtual_function_incomplete_arg_ret_type) + << T; + return true; + } + return false; + }; + QualType RetTy = MD->getReturnType(); + bool IsIncomplete = + !RetTy->isVoidType() && + ReturnOrParamTypeIsIncomplete( + RetTy, OpLoc, MD->getReturnTypeSourceRange().getBegin()); + for (auto *PVD : MD->parameters()) + IsIncomplete |= ReturnOrParamTypeIsIncomplete(PVD->getType(), OpLoc, + PVD->getBeginLoc()); + if (IsIncomplete) + return QualType(); + } + + // Under the MS ABI, lock down the inheritance model now.
if (Context.getTargetInfo().getCXXABI().isMicrosoft()) (void)isCompleteType(OpLoc, MPTy); diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp index 9c423529c80e77..7ccecf055feedd 100644 --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -2564,7 +2564,7 @@ DiagnoseCStringFormatDirectiveInObjCAPI(Sema &S, } else if (Method) { for (const auto *I : Method->specific_attrs()) { - if (S.GetFormatNSStringIdx(I, Idx)) { + if (S.ObjC().GetFormatNSStringIdx(I, Idx)) { Format = true; break; } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index cc9c2598581488..eebe17a5b4bf72 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -356,15 +356,7 @@ static bool isLegalTypeForHLSLSV_DispatchThreadID(QualType T) { return true; } -void SemaHLSL::handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL) { - // FIXME: support semantic on field. - // See https://github.com/llvm/llvm-project/issues/57889. - if (isa(D)) { - Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_ast_node) - << AL << "parameter"; - return; - } - +void SemaHLSL::handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL) { auto *VD = cast(D); if (!isLegalTypeForHLSLSV_DispatchThreadID(VD->getType())) { Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_type) @@ -852,3 +844,231 @@ void SemaHLSL::DiagnoseAvailabilityViolations(TranslationUnitDecl *TU) { DiagnoseHLSLAvailability(SemaRef).RunOnTranslationUnit(TU); } + +// Helper function for CheckHLSLBuiltinFunctionCall +bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) { + assert(TheCall->getNumArgs() > 1); + ExprResult A = TheCall->getArg(0); + + QualType ArgTyA = A.get()->getType(); + + auto *VecTyA = ArgTyA->getAs(); + SourceLocation BuiltinLoc = TheCall->getBeginLoc(); + + for (unsigned i = 1; i < TheCall->getNumArgs(); ++i) { + ExprResult B = TheCall->getArg(i); + QualType ArgTyB = B.get()->getType(); + auto *VecTyB = ArgTyB->getAs(); + if (VecTyA == nullptr && VecTyB == nullptr) + return false; + + if (VecTyA && VecTyB) { + bool retValue = false; + if (VecTyA->getElementType() != VecTyB->getElementType()) { + // Note: type promotion is intended to be handled via the intrinsics + // and not the builtin itself. + S->Diag(TheCall->getBeginLoc(), + diag::err_vec_builtin_incompatible_vector) + << TheCall->getDirectCallee() << /*useAllTerminology*/ true + << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc()); + retValue = true; + } + if (VecTyA->getNumElements() != VecTyB->getNumElements()) { + // You should only be hitting this case if you are calling the builtin + // directly. HLSL intrinsics should avoid this case via a + // HLSLVectorTruncation.
+ S->Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector) + << TheCall->getDirectCallee() << /*useAllTerminology*/ true + << SourceRange(TheCall->getArg(0)->getBeginLoc(), + TheCall->getArg(1)->getEndLoc()); + retValue = true; + } + return retValue; + } + } + + // Note: if we get here one of the args is a scalar which + // requires a VectorSplat on Arg0 or Arg1 + S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) + << TheCall->getDirectCallee() << /*useAllTerminology*/ true + << SourceRange(TheCall->getArg(0)->getBeginLoc(), + TheCall->getArg(1)->getEndLoc()); + return true; +} + +bool CheckArgsTypesAreCorrect( + Sema *S, CallExpr *TheCall, QualType ExpectedType, + llvm::function_ref Check) { + for (unsigned i = 0; i < TheCall->getNumArgs(); ++i) { + QualType PassedType = TheCall->getArg(i)->getType(); + if (Check(PassedType)) { + if (auto *VecTyA = PassedType->getAs()) + ExpectedType = S->Context.getVectorType( + ExpectedType, VecTyA->getNumElements(), VecTyA->getVectorKind()); + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_typecheck_convert_incompatible) + << PassedType << ExpectedType << 1 << 0 << 0; + return true; + } + } + return false; +} + +bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) { + auto checkAllFloatTypes = [](clang::QualType PassedType) -> bool { + return !PassedType->hasFloatingRepresentation(); + }; + return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy, + checkAllFloatTypes); +} + +bool CheckFloatOrHalfRepresentations(Sema *S, CallExpr *TheCall) { + auto checkFloatorHalf = [](clang::QualType PassedType) -> bool { + clang::QualType BaseType = + PassedType->isVectorType() + ? PassedType->getAs()->getElementType() + : PassedType; + return !BaseType->isHalfType() && !BaseType->isFloat32Type(); + }; + return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy, + checkFloatorHalf); +} + +bool CheckNoDoubleVectors(Sema *S, CallExpr *TheCall) { + auto checkDoubleVector = [](clang::QualType PassedType) -> bool { + if (const auto *VecTy = PassedType->getAs()) + return VecTy->getElementType()->isDoubleType(); + return false; + }; + return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy, + checkDoubleVector); +} + +bool CheckUnsignedIntRepresentation(Sema *S, CallExpr *TheCall) { + auto checkAllUnsignedTypes = [](clang::QualType PassedType) -> bool { + return !PassedType->hasUnsignedIntegerRepresentation(); + }; + return CheckArgsTypesAreCorrect(S, TheCall, S->Context.UnsignedIntTy, + checkAllUnsignedTypes); +} + +void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall, + QualType ReturnType) { + auto *VecTyA = TheCall->getArg(0)->getType()->getAs(); + if (VecTyA) + ReturnType = S->Context.getVectorType(ReturnType, VecTyA->getNumElements(), + VectorKind::Generic); + TheCall->setType(ReturnType); +} + +// Note: returning true in this case results in CheckBuiltinFunctionCall +// returning an ExprError +bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + switch (BuiltinID) { + case Builtin::BI__builtin_hlsl_elementwise_all: + case Builtin::BI__builtin_hlsl_elementwise_any: { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + break; + } + case Builtin::BI__builtin_hlsl_elementwise_clamp: { + if (SemaRef.checkArgCount(TheCall, 3)) + return true; + if (CheckVectorElementCallArgs(&SemaRef, TheCall)) + return true; + if (SemaRef.BuiltinElementwiseTernaryMath( + TheCall, /*CheckForFloatArgs*/ + TheCall->getArg(0)->getType()->hasFloatingRepresentation())) + return true; + break; + } + 
case Builtin::BI__builtin_hlsl_dot: { + if (SemaRef.checkArgCount(TheCall, 2)) + return true; + if (CheckVectorElementCallArgs(&SemaRef, TheCall)) + return true; + if (SemaRef.BuiltinVectorToScalarMath(TheCall)) + return true; + if (CheckNoDoubleVectors(&SemaRef, TheCall)) + return true; + break; + } + case Builtin::BI__builtin_hlsl_elementwise_rcp: { + if (CheckAllArgsHaveFloatRepresentation(&SemaRef, TheCall)) + return true; + if (SemaRef.PrepareBuiltinElementwiseMathOneArgCall(TheCall)) + return true; + break; + } + case Builtin::BI__builtin_hlsl_elementwise_rsqrt: + case Builtin::BI__builtin_hlsl_elementwise_frac: { + if (CheckFloatOrHalfRepresentations(&SemaRef, TheCall)) + return true; + if (SemaRef.PrepareBuiltinElementwiseMathOneArgCall(TheCall)) + return true; + break; + } + case Builtin::BI__builtin_hlsl_elementwise_isinf: { + if (CheckFloatOrHalfRepresentations(&SemaRef, TheCall)) + return true; + if (SemaRef.PrepareBuiltinElementwiseMathOneArgCall(TheCall)) + return true; + SetElementTypeAsReturnType(&SemaRef, TheCall, getASTContext().BoolTy); + break; + } + case Builtin::BI__builtin_hlsl_lerp: { + if (SemaRef.checkArgCount(TheCall, 3)) + return true; + if (CheckVectorElementCallArgs(&SemaRef, TheCall)) + return true; + if (SemaRef.BuiltinElementwiseTernaryMath(TheCall)) + return true; + if (CheckFloatOrHalfRepresentations(&SemaRef, TheCall)) + return true; + break; + } + case Builtin::BI__builtin_hlsl_mad: { + if (SemaRef.checkArgCount(TheCall, 3)) + return true; + if (CheckVectorElementCallArgs(&SemaRef, TheCall)) + return true; + if (SemaRef.BuiltinElementwiseTernaryMath( + TheCall, /*CheckForFloatArgs*/ + TheCall->getArg(0)->getType()->hasFloatingRepresentation())) + return true; + break; + } + // Note these are llvm builtins that we want to catch invalid intrinsic + // generation. Normal handling of these builtins will occur elsewhere.
+ case Builtin::BI__builtin_elementwise_bitreverse: { + if (CheckUnsignedIntRepresentation(&SemaRef, TheCall)) + return true; + break; + } + case Builtin::BI__builtin_elementwise_acos: + case Builtin::BI__builtin_elementwise_asin: + case Builtin::BI__builtin_elementwise_atan: + case Builtin::BI__builtin_elementwise_ceil: + case Builtin::BI__builtin_elementwise_cos: + case Builtin::BI__builtin_elementwise_cosh: + case Builtin::BI__builtin_elementwise_exp: + case Builtin::BI__builtin_elementwise_exp2: + case Builtin::BI__builtin_elementwise_floor: + case Builtin::BI__builtin_elementwise_log: + case Builtin::BI__builtin_elementwise_log2: + case Builtin::BI__builtin_elementwise_log10: + case Builtin::BI__builtin_elementwise_pow: + case Builtin::BI__builtin_elementwise_roundeven: + case Builtin::BI__builtin_elementwise_sin: + case Builtin::BI__builtin_elementwise_sinh: + case Builtin::BI__builtin_elementwise_sqrt: + case Builtin::BI__builtin_elementwise_tan: + case Builtin::BI__builtin_elementwise_tanh: + case Builtin::BI__builtin_elementwise_trunc: { + if (CheckFloatOrHalfRepresentations(&SemaRef, TheCall)) + return true; + break; + } + } + return false; +} diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index f820db5233f539..26c65b34f4cc45 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CheckExprLifetime.h" #include "clang/AST/ASTContext.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/Expr.h" @@ -7284,1238 +7285,9 @@ PerformConstructorInitialization(Sema &S, return CurInit; } -namespace { -enum LifetimeKind { - /// The lifetime of a temporary bound to this entity ends at the end of the - /// full-expression, and that's (probably) fine. - LK_FullExpression, - - /// The lifetime of a temporary bound to this entity is extended to the - /// lifeitme of the entity itself. - LK_Extended, - - /// The lifetime of a temporary bound to this entity probably ends too soon, - /// because the entity is allocated in a new-expression. - LK_New, - - /// The lifetime of a temporary bound to this entity ends too soon, because - /// the entity is a return object. - LK_Return, - - /// The lifetime of a temporary bound to this entity ends too soon, because - /// the entity is the result of a statement expression. - LK_StmtExprResult, - - /// This is a mem-initializer: if it would extend a temporary (other than via - /// a default member initializer), the program is ill-formed. - LK_MemInitializer, -}; -using LifetimeResult = - llvm::PointerIntPair; -} - -/// Determine the declaration which an initialized entity ultimately refers to, -/// for the purpose of lifetime-extending a temporary bound to a reference in -/// the initialization of \p Entity. -static LifetimeResult getEntityLifetime( - const InitializedEntity *Entity, - const InitializedEntity *InitField = nullptr) { - // C++11 [class.temporary]p5: - switch (Entity->getKind()) { - case InitializedEntity::EK_Variable: - // The temporary [...] persists for the lifetime of the reference - return {Entity, LK_Extended}; - - case InitializedEntity::EK_Member: - // For subobjects, we look at the complete object. - if (Entity->getParent()) - return getEntityLifetime(Entity->getParent(), Entity); - - // except: - // C++17 [class.base.init]p8: - // A temporary expression bound to a reference member in a - // mem-initializer is ill-formed. 
- // C++17 [class.base.init]p11: - // A temporary expression bound to a reference member from a - // default member initializer is ill-formed. - // - // The context of p11 and its example suggest that it's only the use of a - // default member initializer from a constructor that makes the program - // ill-formed, not its mere existence, and that it can even be used by - // aggregate initialization. - return {Entity, Entity->isDefaultMemberInitializer() ? LK_Extended - : LK_MemInitializer}; - - case InitializedEntity::EK_Binding: - // Per [dcl.decomp]p3, the binding is treated as a variable of reference - // type. - return {Entity, LK_Extended}; - - case InitializedEntity::EK_Parameter: - case InitializedEntity::EK_Parameter_CF_Audited: - // -- A temporary bound to a reference parameter in a function call - // persists until the completion of the full-expression containing - // the call. - return {nullptr, LK_FullExpression}; - - case InitializedEntity::EK_TemplateParameter: - // FIXME: This will always be ill-formed; should we eagerly diagnose it here? - return {nullptr, LK_FullExpression}; - - case InitializedEntity::EK_Result: - // -- The lifetime of a temporary bound to the returned value in a - // function return statement is not extended; the temporary is - // destroyed at the end of the full-expression in the return statement. - return {nullptr, LK_Return}; - - case InitializedEntity::EK_StmtExprResult: - // FIXME: Should we lifetime-extend through the result of a statement - // expression? - return {nullptr, LK_StmtExprResult}; - - case InitializedEntity::EK_New: - // -- A temporary bound to a reference in a new-initializer persists - // until the completion of the full-expression containing the - // new-initializer. - return {nullptr, LK_New}; - - case InitializedEntity::EK_Temporary: - case InitializedEntity::EK_CompoundLiteralInit: - case InitializedEntity::EK_RelatedResult: - // We don't yet know the storage duration of the surrounding temporary. - // Assume it's got full-expression duration for now, it will patch up our - // storage duration if that's not correct. - return {nullptr, LK_FullExpression}; - - case InitializedEntity::EK_ArrayElement: - // For subobjects, we look at the complete object. - return getEntityLifetime(Entity->getParent(), InitField); - - case InitializedEntity::EK_Base: - // For subobjects, we look at the complete object. - if (Entity->getParent()) - return getEntityLifetime(Entity->getParent(), InitField); - return {InitField, LK_MemInitializer}; - - case InitializedEntity::EK_Delegating: - // We can reach this case for aggregate initialization in a constructor: - // struct A { int &&r; }; - // struct B : A { B() : A{0} {} }; - // In this case, use the outermost field decl as the context. - return {InitField, LK_MemInitializer}; - - case InitializedEntity::EK_BlockElement: - case InitializedEntity::EK_LambdaToBlockConversionBlockElement: - case InitializedEntity::EK_LambdaCapture: - case InitializedEntity::EK_VectorElement: - case InitializedEntity::EK_ComplexElement: - return {nullptr, LK_FullExpression}; - - case InitializedEntity::EK_Exception: - // FIXME: Can we diagnose lifetime problems with exceptions? - return {nullptr, LK_FullExpression}; - - case InitializedEntity::EK_ParenAggInitMember: - // -- A temporary object bound to a reference element of an aggregate of - // class type initialized from a parenthesized expression-list - // [dcl.init, 9.3] persists until the completion of the full-expression - // containing the expression-list. 
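Note (illustrative): for the EK_New case described above, the bound temporary still ends with the full-expression, so a reference member of the allocated object is left dangling. A sketch of the pattern the warn_new_dangling_reference diagnostic targets (exact wording may differ between Clang versions):

```cpp
struct S { const int &Ref; };

int main() {
  // The int temporary dies at the end of this full-expression, leaving
  // P->Ref dangling; Clang warns about the binding.
  S *P = new S{42};
  delete P;
}
```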
- return {nullptr, LK_FullExpression}; - } - - llvm_unreachable("unknown entity kind"); -} - -namespace { -enum ReferenceKind { - /// Lifetime would be extended by a reference binding to a temporary. - RK_ReferenceBinding, - /// Lifetime would be extended by a std::initializer_list object binding to - /// its backing array. - RK_StdInitializerList, -}; - -/// A temporary or local variable. This will be one of: -/// * A MaterializeTemporaryExpr. -/// * A DeclRefExpr whose declaration is a local. -/// * An AddrLabelExpr. -/// * A BlockExpr for a block with captures. -using Local = Expr*; - -/// Expressions we stepped over when looking for the local state. Any steps -/// that would inhibit lifetime extension or take us out of subexpressions of -/// the initializer are included. -struct IndirectLocalPathEntry { - enum EntryKind { - DefaultInit, - AddressOf, - VarInit, - LValToRVal, - LifetimeBoundCall, - TemporaryCopy, - LambdaCaptureInit, - GslReferenceInit, - GslPointerInit - } Kind; - Expr *E; - union { - const Decl *D = nullptr; - const LambdaCapture *Capture; - }; - IndirectLocalPathEntry() {} - IndirectLocalPathEntry(EntryKind K, Expr *E) : Kind(K), E(E) {} - IndirectLocalPathEntry(EntryKind K, Expr *E, const Decl *D) - : Kind(K), E(E), D(D) {} - IndirectLocalPathEntry(EntryKind K, Expr *E, const LambdaCapture *Capture) - : Kind(K), E(E), Capture(Capture) {} -}; - -using IndirectLocalPath = llvm::SmallVectorImpl; - -struct RevertToOldSizeRAII { - IndirectLocalPath &Path; - unsigned OldSize = Path.size(); - RevertToOldSizeRAII(IndirectLocalPath &Path) : Path(Path) {} - ~RevertToOldSizeRAII() { Path.resize(OldSize); } -}; - -using LocalVisitor = llvm::function_ref; -} - -static bool isVarOnPath(IndirectLocalPath &Path, VarDecl *VD) { - for (auto E : Path) - if (E.Kind == IndirectLocalPathEntry::VarInit && E.D == VD) - return true; - return false; -} - -static bool pathContainsInit(IndirectLocalPath &Path) { - return llvm::any_of(Path, [=](IndirectLocalPathEntry E) { - return E.Kind == IndirectLocalPathEntry::DefaultInit || - E.Kind == IndirectLocalPathEntry::VarInit; - }); -} - -static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, - Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings); - -static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, - Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings); - -template static bool isRecordWithAttr(QualType Type) { - if (auto *RD = Type->getAsCXXRecordDecl()) - return RD->hasAttr(); - return false; -} - -// Decl::isInStdNamespace will return false for iterators in some STL -// implementations due to them being defined in a namespace outside of the std -// namespace. 
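Note (illustrative): isRecordWithAttr keys off the [[gsl::Owner]] and [[gsl::Pointer]] annotations. A sketch of how a type opts in; standard-library containers and views typically get these either in the headers or inferred by Clang:

```cpp
// A type annotated [[gsl::Owner(T)]] owns data of type T; [[gsl::Pointer(T)]]
// is a non-owning handle into such an owner. The analysis above follows
// values flowing from an Owner into a Pointer.
struct [[gsl::Owner(int)]] IntOwner {
  int Value = 0;
  int *data() { return &Value; }
};

struct [[gsl::Pointer(int)]] IntView {
  int *Ptr = nullptr;
};

int main() {}
```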
-static bool isInStlNamespace(const Decl *D) { - const DeclContext *DC = D->getDeclContext(); - if (!DC) - return false; - if (const auto *ND = dyn_cast(DC)) - if (const IdentifierInfo *II = ND->getIdentifier()) { - StringRef Name = II->getName(); - if (Name.size() >= 2 && Name.front() == '_' && - (Name[1] == '_' || isUppercase(Name[1]))) - return true; - } - - return DC->isStdNamespace(); -} - -static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { - if (auto *Conv = dyn_cast_or_null(Callee)) - if (isRecordWithAttr(Conv->getConversionType())) - return true; - if (!isInStlNamespace(Callee->getParent())) - return false; - if (!isRecordWithAttr( - Callee->getFunctionObjectParameterType()) && - !isRecordWithAttr(Callee->getFunctionObjectParameterType())) - return false; - if (Callee->getReturnType()->isPointerType() || - isRecordWithAttr(Callee->getReturnType())) { - if (!Callee->getIdentifier()) - return false; - return llvm::StringSwitch(Callee->getName()) - .Cases("begin", "rbegin", "cbegin", "crbegin", true) - .Cases("end", "rend", "cend", "crend", true) - .Cases("c_str", "data", "get", true) - // Map and set types. - .Cases("find", "equal_range", "lower_bound", "upper_bound", true) - .Default(false); - } else if (Callee->getReturnType()->isReferenceType()) { - if (!Callee->getIdentifier()) { - auto OO = Callee->getOverloadedOperator(); - return OO == OverloadedOperatorKind::OO_Subscript || - OO == OverloadedOperatorKind::OO_Star; - } - return llvm::StringSwitch(Callee->getName()) - .Cases("front", "back", "at", "top", "value", true) - .Default(false); - } - return false; -} - -static bool shouldTrackFirstArgument(const FunctionDecl *FD) { - if (!FD->getIdentifier() || FD->getNumParams() != 1) - return false; - const auto *RD = FD->getParamDecl(0)->getType()->getPointeeCXXRecordDecl(); - if (!FD->isInStdNamespace() || !RD || !RD->isInStdNamespace()) - return false; - if (!isRecordWithAttr(QualType(RD->getTypeForDecl(), 0)) && - !isRecordWithAttr(QualType(RD->getTypeForDecl(), 0))) - return false; - if (FD->getReturnType()->isPointerType() || - isRecordWithAttr(FD->getReturnType())) { - return llvm::StringSwitch(FD->getName()) - .Cases("begin", "rbegin", "cbegin", "crbegin", true) - .Cases("end", "rend", "cend", "crend", true) - .Case("data", true) - .Default(false); - } else if (FD->getReturnType()->isReferenceType()) { - return llvm::StringSwitch(FD->getName()) - .Cases("get", "any_cast", true) - .Default(false); - } - return false; -} - -static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit) { - auto VisitPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { - // We are not interested in the temporary base objects of gsl Pointers: - // Temp().ptr; // Here ptr might not dangle. - if (isa(Arg->IgnoreImpCasts())) - return; - // Once we initialized a value with a reference, it can no longer dangle. - if (!Value) { - for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { - if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) - continue; - if (PE.Kind == IndirectLocalPathEntry::GslPointerInit) - return; - break; - } - } - Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit - : IndirectLocalPathEntry::GslReferenceInit, - Arg, D}); - if (Arg->isGLValue()) - visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/true); - else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/true); - Path.pop_back(); - }; - - if (auto *MCE = dyn_cast(Call)) { - const auto *MD = cast_or_null(MCE->getDirectCallee()); - if (MD && shouldTrackImplicitObjectArg(MD)) - VisitPointerArg(MD, MCE->getImplicitObjectArgument(), - !MD->getReturnType()->isReferenceType()); - return; - } else if (auto *OCE = dyn_cast(Call)) { - FunctionDecl *Callee = OCE->getDirectCallee(); - if (Callee && Callee->isCXXInstanceMember() && - shouldTrackImplicitObjectArg(cast(Callee))) - VisitPointerArg(Callee, OCE->getArg(0), - !Callee->getReturnType()->isReferenceType()); - return; - } else if (auto *CE = dyn_cast(Call)) { - FunctionDecl *Callee = CE->getDirectCallee(); - if (Callee && shouldTrackFirstArgument(Callee)) - VisitPointerArg(Callee, CE->getArg(0), - !Callee->getReturnType()->isReferenceType()); - return; - } - - if (auto *CCE = dyn_cast(Call)) { - const auto *Ctor = CCE->getConstructor(); - const CXXRecordDecl *RD = Ctor->getParent(); - if (CCE->getNumArgs() > 0 && RD->hasAttr()) - VisitPointerArg(Ctor->getParamDecl(0), CCE->getArgs()[0], true); - } -} - -static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { - const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); - if (!TSI) - return false; - // Don't declare this variable in the second operand of the for-statement; - // GCC miscompiles that by ending its lifetime before evaluating the - // third operand. See gcc.gnu.org/PR86769. - AttributedTypeLoc ATL; - for (TypeLoc TL = TSI->getTypeLoc(); - (ATL = TL.getAsAdjusted()); - TL = ATL.getModifiedLoc()) { - if (ATL.getAttrAs()) - return true; - } - - // Assume that all assignment operators with a "normal" return type return - // *this, that is, an lvalue reference that is the same type as the implicit - // object parameter (or the LHS for a non-member operator$=). 
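Note (illustrative): implicitObjectParamIsLifetimeBound looks for [[clang::lifetimebound]] written on the implicit object parameter, and treats assignment-style operators returning *this the same way. A small use of the attribute; the exact diagnostic text may vary:

```cpp
struct Box {
  int Value = 0;
  // The attribute says the returned reference must not outlive *this.
  const int &get() const [[clang::lifetimebound]] { return Value; }
};

int main() {
  // The Box temporary dies at the end of the full-expression, so Clang's
  // -Wdangling machinery flags this binding.
  const int &R = Box{}.get();
  return R;
}
```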
- OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); - if (OO == OO_Equal || isCompoundAssignmentOperator(OO)) { - QualType RetT = FD->getReturnType(); - if (RetT->isLValueReferenceType()) { - ASTContext &Ctx = FD->getASTContext(); - QualType LHST; - auto *MD = dyn_cast(FD); - if (MD && MD->isCXXInstanceMember()) - LHST = Ctx.getLValueReferenceType(MD->getFunctionObjectParameterType()); - else - LHST = MD->getParamDecl(0)->getType(); - if (Ctx.hasSameType(RetT, LHST)) - return true; - } - } - - return false; -} - -static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit) { - const FunctionDecl *Callee; - ArrayRef Args; - - if (auto *CE = dyn_cast(Call)) { - Callee = CE->getDirectCallee(); - Args = llvm::ArrayRef(CE->getArgs(), CE->getNumArgs()); - } else { - auto *CCE = cast(Call); - Callee = CCE->getConstructor(); - Args = llvm::ArrayRef(CCE->getArgs(), CCE->getNumArgs()); - } - if (!Callee) - return; - - Expr *ObjectArg = nullptr; - if (isa(Call) && Callee->isCXXInstanceMember()) { - ObjectArg = Args[0]; - Args = Args.slice(1); - } else if (auto *MCE = dyn_cast(Call)) { - ObjectArg = MCE->getImplicitObjectArgument(); - } - - auto VisitLifetimeBoundArg = [&](const Decl *D, Expr *Arg) { - Path.push_back({IndirectLocalPathEntry::LifetimeBoundCall, Arg, D}); - if (Arg->isGLValue()) - visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/false); - else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/false); - Path.pop_back(); - }; - - bool CheckCoroCall = false; - if (const auto *RD = Callee->getReturnType()->getAsRecordDecl()) { - CheckCoroCall = RD->hasAttr() && - RD->hasAttr() && - !Callee->hasAttr(); - } - - if (ObjectArg) { - bool CheckCoroObjArg = CheckCoroCall; - // Coroutine lambda objects with empty capture list are not lifetimebound. - if (auto *LE = dyn_cast(ObjectArg->IgnoreImplicit()); - LE && LE->captures().empty()) - CheckCoroObjArg = false; - // Allow `get_return_object()` as the object param (__promise) is not - // lifetimebound. - if (Sema::CanBeGetReturnObject(Callee)) - CheckCoroObjArg = false; - if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) - VisitLifetimeBoundArg(Callee, ObjectArg); - } - - for (unsigned I = 0, - N = std::min(Callee->getNumParams(), Args.size()); - I != N; ++I) { - if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) - VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); - } -} - -/// Visit the locals that would be reachable through a reference bound to the -/// glvalue expression \c Init. -static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, - Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings) { - RevertToOldSizeRAII RAII(Path); - - // Walk past any constructs which we can lifetime-extend across. - Expr *Old; - do { - Old = Init; - - if (auto *FE = dyn_cast(Init)) - Init = FE->getSubExpr(); - - if (InitListExpr *ILE = dyn_cast(Init)) { - // If this is just redundant braces around an initializer, step over it. - if (ILE->isTransparent()) - Init = ILE->getInit(0); - } - - // Step over any subobject adjustments; we may have a materialized - // temporary inside them. - Init = const_cast(Init->skipRValueSubobjectAdjustments()); - - // Per current approach for DR1376, look through casts to reference type - // when performing lifetime extension. 
- if (CastExpr *CE = dyn_cast(Init)) - if (CE->getSubExpr()->isGLValue()) - Init = CE->getSubExpr(); - - // Per the current approach for DR1299, look through array element access - // on array glvalues when performing lifetime extension. - if (auto *ASE = dyn_cast(Init)) { - Init = ASE->getBase(); - auto *ICE = dyn_cast(Init); - if (ICE && ICE->getCastKind() == CK_ArrayToPointerDecay) - Init = ICE->getSubExpr(); - else - // We can't lifetime extend through this but we might still find some - // retained temporaries. - return visitLocalsRetainedByInitializer(Path, Init, Visit, true, - EnableLifetimeWarnings); - } - - // Step into CXXDefaultInitExprs so we can diagnose cases where a - // constructor inherits one as an implicit mem-initializer. - if (auto *DIE = dyn_cast(Init)) { - Path.push_back( - {IndirectLocalPathEntry::DefaultInit, DIE, DIE->getField()}); - Init = DIE->getExpr(); - } - } while (Init != Old); - - if (auto *MTE = dyn_cast(Init)) { - if (Visit(Path, Local(MTE), RK)) - visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true, - EnableLifetimeWarnings); - } - - if (isa(Init)) { - if (EnableLifetimeWarnings) - handleGslAnnotatedTypes(Path, Init, Visit); - return visitLifetimeBoundArguments(Path, Init, Visit); - } - - switch (Init->getStmtClass()) { - case Stmt::DeclRefExprClass: { - // If we find the name of a local non-reference parameter, we could have a - // lifetime problem. - auto *DRE = cast(Init); - auto *VD = dyn_cast(DRE->getDecl()); - if (VD && VD->hasLocalStorage() && - !DRE->refersToEnclosingVariableOrCapture()) { - if (!VD->getType()->isReferenceType()) { - Visit(Path, Local(DRE), RK); - } else if (isa(DRE->getDecl())) { - // The lifetime of a reference parameter is unknown; assume it's OK - // for now. - break; - } else if (VD->getInit() && !isVarOnPath(Path, VD)) { - Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); - visitLocalsRetainedByReferenceBinding(Path, VD->getInit(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); - } - } - break; - } - - case Stmt::UnaryOperatorClass: { - // The only unary operator that make sense to handle here - // is Deref. All others don't resolve to a "name." This includes - // handling all sorts of rvalues passed to a unary operator. - const UnaryOperator *U = cast(Init); - if (U->getOpcode() == UO_Deref) - visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true, - EnableLifetimeWarnings); - break; - } - - case Stmt::ArraySectionExprClass: { - visitLocalsRetainedByInitializer(Path, - cast(Init)->getBase(), - Visit, true, EnableLifetimeWarnings); - break; - } - - case Stmt::ConditionalOperatorClass: - case Stmt::BinaryConditionalOperatorClass: { - auto *C = cast(Init); - if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit, - EnableLifetimeWarnings); - if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit, - EnableLifetimeWarnings); - break; - } - - case Stmt::CompoundLiteralExprClass: { - if (auto *CLE = dyn_cast(Init)) { - if (!CLE->isFileScope()) - Visit(Path, Local(CLE), RK); - } - break; - } - - // FIXME: Visit the left-hand side of an -> or ->*. - - default: - break; - } -} - -/// Visit the locals that would be reachable through an object initialized by -/// the prvalue expression \c Init. 
-static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, - Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings) { - RevertToOldSizeRAII RAII(Path); - - Expr *Old; - do { - Old = Init; - - // Step into CXXDefaultInitExprs so we can diagnose cases where a - // constructor inherits one as an implicit mem-initializer. - if (auto *DIE = dyn_cast(Init)) { - Path.push_back({IndirectLocalPathEntry::DefaultInit, DIE, DIE->getField()}); - Init = DIE->getExpr(); - } - - if (auto *FE = dyn_cast(Init)) - Init = FE->getSubExpr(); - - // Dig out the expression which constructs the extended temporary. - Init = const_cast(Init->skipRValueSubobjectAdjustments()); - - if (CXXBindTemporaryExpr *BTE = dyn_cast(Init)) - Init = BTE->getSubExpr(); - - Init = Init->IgnoreParens(); - - // Step over value-preserving rvalue casts. - if (auto *CE = dyn_cast(Init)) { - switch (CE->getCastKind()) { - case CK_LValueToRValue: - // If we can match the lvalue to a const object, we can look at its - // initializer. - Path.push_back({IndirectLocalPathEntry::LValToRVal, CE}); - return visitLocalsRetainedByReferenceBinding( - Path, Init, RK_ReferenceBinding, - [&](IndirectLocalPath &Path, Local L, ReferenceKind RK) -> bool { - if (auto *DRE = dyn_cast(L)) { - auto *VD = dyn_cast(DRE->getDecl()); - if (VD && VD->getType().isConstQualified() && VD->getInit() && - !isVarOnPath(Path, VD)) { - Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); - visitLocalsRetainedByInitializer(Path, VD->getInit(), Visit, true, - EnableLifetimeWarnings); - } - } else if (auto *MTE = dyn_cast(L)) { - if (MTE->getType().isConstQualified()) - visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, - true, EnableLifetimeWarnings); - } - return false; - }, EnableLifetimeWarnings); - - // We assume that objects can be retained by pointers cast to integers, - // but not if the integer is cast to floating-point type or to _Complex. - // We assume that casts to 'bool' do not preserve enough information to - // retain a local object. - case CK_NoOp: - case CK_BitCast: - case CK_BaseToDerived: - case CK_DerivedToBase: - case CK_UncheckedDerivedToBase: - case CK_Dynamic: - case CK_ToUnion: - case CK_UserDefinedConversion: - case CK_ConstructorConversion: - case CK_IntegralToPointer: - case CK_PointerToIntegral: - case CK_VectorSplat: - case CK_IntegralCast: - case CK_CPointerToObjCPointerCast: - case CK_BlockPointerToObjCPointerCast: - case CK_AnyPointerToBlockPointerCast: - case CK_AddressSpaceConversion: - break; - - case CK_ArrayToPointerDecay: - // Model array-to-pointer decay as taking the address of the array - // lvalue. - Path.push_back({IndirectLocalPathEntry::AddressOf, CE}); - return visitLocalsRetainedByReferenceBinding(Path, CE->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); - - default: - return; - } - - Init = CE->getSubExpr(); - } - } while (Old != Init); - - // C++17 [dcl.init.list]p6: - // initializing an initializer_list object from the array extends the - // lifetime of the array exactly like binding a reference to a temporary. - if (auto *ILE = dyn_cast(Init)) - return visitLocalsRetainedByReferenceBinding(Path, ILE->getSubExpr(), - RK_StdInitializerList, Visit, - EnableLifetimeWarnings); - - if (InitListExpr *ILE = dyn_cast(Init)) { - // We already visited the elements of this initializer list while - // performing the initialization. Don't visit them again unless we've - // changed the lifetime of the initialized entity. 
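Note (illustrative): the [dcl.init.list]p6 rule cited above is easy to see in isolation:

```cpp
#include <initializer_list>

int main() {
  // The backing array of the list is lifetime-extended exactly as if a
  // reference had been bound to it, so iterating IL here is valid.
  std::initializer_list<int> IL = {1, 2, 3};
  int Sum = 0;
  for (int V : IL)
    Sum += V;
  return Sum;
}
```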
- if (!RevisitSubinits) - return; - - if (ILE->isTransparent()) - return visitLocalsRetainedByInitializer(Path, ILE->getInit(0), Visit, - RevisitSubinits, - EnableLifetimeWarnings); - - if (ILE->getType()->isArrayType()) { - for (unsigned I = 0, N = ILE->getNumInits(); I != N; ++I) - visitLocalsRetainedByInitializer(Path, ILE->getInit(I), Visit, - RevisitSubinits, - EnableLifetimeWarnings); - return; - } - - if (CXXRecordDecl *RD = ILE->getType()->getAsCXXRecordDecl()) { - assert(RD->isAggregate() && "aggregate init on non-aggregate"); - - // If we lifetime-extend a braced initializer which is initializing an - // aggregate, and that aggregate contains reference members which are - // bound to temporaries, those temporaries are also lifetime-extended. - if (RD->isUnion() && ILE->getInitializedFieldInUnion() && - ILE->getInitializedFieldInUnion()->getType()->isReferenceType()) - visitLocalsRetainedByReferenceBinding(Path, ILE->getInit(0), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); - else { - unsigned Index = 0; - for (; Index < RD->getNumBases() && Index < ILE->getNumInits(); ++Index) - visitLocalsRetainedByInitializer(Path, ILE->getInit(Index), Visit, - RevisitSubinits, - EnableLifetimeWarnings); - for (const auto *I : RD->fields()) { - if (Index >= ILE->getNumInits()) - break; - if (I->isUnnamedBitField()) - continue; - Expr *SubInit = ILE->getInit(Index); - if (I->getType()->isReferenceType()) - visitLocalsRetainedByReferenceBinding(Path, SubInit, - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); - else - // This might be either aggregate-initialization of a member or - // initialization of a std::initializer_list object. Regardless, - // we should recursively lifetime-extend that initializer. - visitLocalsRetainedByInitializer(Path, SubInit, Visit, - RevisitSubinits, - EnableLifetimeWarnings); - ++Index; - } - } - } - return; - } - - // The lifetime of an init-capture is that of the closure object constructed - // by a lambda-expression. - if (auto *LE = dyn_cast(Init)) { - LambdaExpr::capture_iterator CapI = LE->capture_begin(); - for (Expr *E : LE->capture_inits()) { - assert(CapI != LE->capture_end()); - const LambdaCapture &Cap = *CapI++; - if (!E) - continue; - if (Cap.capturesVariable()) - Path.push_back({IndirectLocalPathEntry::LambdaCaptureInit, E, &Cap}); - if (E->isGLValue()) - visitLocalsRetainedByReferenceBinding(Path, E, RK_ReferenceBinding, - Visit, EnableLifetimeWarnings); - else - visitLocalsRetainedByInitializer(Path, E, Visit, true, - EnableLifetimeWarnings); - if (Cap.capturesVariable()) - Path.pop_back(); - } - } - - // Assume that a copy or move from a temporary references the same objects - // that the temporary does. - if (auto *CCE = dyn_cast(Init)) { - if (CCE->getConstructor()->isCopyOrMoveConstructor()) { - if (auto *MTE = dyn_cast(CCE->getArg(0))) { - Expr *Arg = MTE->getSubExpr(); - Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg, - CCE->getConstructor()}); - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings*/false); - Path.pop_back(); - } - } - } - - if (isa(Init) || isa(Init)) { - if (EnableLifetimeWarnings) - handleGslAnnotatedTypes(Path, Init, Visit); - return visitLifetimeBoundArguments(Path, Init, Visit); - } - - switch (Init->getStmtClass()) { - case Stmt::UnaryOperatorClass: { - auto *UO = cast(Init); - // If the initializer is the address of a local, we could have a lifetime - // problem. 
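Note (illustrative): the aggregate handling above propagates lifetime extension into reference members of a lifetime-extended braced initializer. A minimal example of the effect:

```cpp
struct Agg { const int &Ref; };

int main() {
  // Extending the Agg temporary also extends the int temporary bound to Ref,
  // so both live as long as A.
  const Agg &A = Agg{10};
  return A.Ref;
}
```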
- if (UO->getOpcode() == UO_AddrOf) { - // If this is &rvalue, then it's ill-formed and we have already diagnosed - // it. Don't produce a redundant warning about the lifetime of the - // temporary. - if (isa(UO->getSubExpr())) - return; - - Path.push_back({IndirectLocalPathEntry::AddressOf, UO}); - visitLocalsRetainedByReferenceBinding(Path, UO->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); - } - break; - } - - case Stmt::BinaryOperatorClass: { - // Handle pointer arithmetic. - auto *BO = cast(Init); - BinaryOperatorKind BOK = BO->getOpcode(); - if (!BO->getType()->isPointerType() || (BOK != BO_Add && BOK != BO_Sub)) - break; - - if (BO->getLHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true, - EnableLifetimeWarnings); - else if (BO->getRHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true, - EnableLifetimeWarnings); - break; - } - - case Stmt::ConditionalOperatorClass: - case Stmt::BinaryConditionalOperatorClass: { - auto *C = cast(Init); - // In C++, we can have a throw-expression operand, which has 'void' type - // and isn't interesting from a lifetime perspective. - if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true, - EnableLifetimeWarnings); - if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true, - EnableLifetimeWarnings); - break; - } - - case Stmt::BlockExprClass: - if (cast(Init)->getBlockDecl()->hasCaptures()) { - // This is a local block, whose lifetime is that of the function. - Visit(Path, Local(cast(Init)), RK_ReferenceBinding); - } - break; - - case Stmt::AddrLabelExprClass: - // We want to warn if the address of a label would escape the function. - Visit(Path, Local(cast(Init)), RK_ReferenceBinding); - break; - - default: - break; - } -} - -/// Whether a path to an object supports lifetime extension. -enum PathLifetimeKind { - /// Lifetime-extend along this path. - Extend, - /// We should lifetime-extend, but we don't because (due to technical - /// limitations) we can't. This happens for default member initializers, - /// which we don't clone for every use, so we don't have a unique - /// MaterializeTemporaryExpr to update. - ShouldExtend, - /// Do not lifetime extend along this path. - NoExtend -}; - -/// Determine whether this is an indirect path to a temporary that we are -/// supposed to lifetime-extend along. -static PathLifetimeKind -shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) { - PathLifetimeKind Kind = PathLifetimeKind::Extend; - for (auto Elem : Path) { - if (Elem.Kind == IndirectLocalPathEntry::DefaultInit) - Kind = PathLifetimeKind::ShouldExtend; - else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit) - return PathLifetimeKind::NoExtend; - } - return Kind; -} - -/// Find the range for the first interesting entry in the path at or after I. -static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I, - Expr *E) { - for (unsigned N = Path.size(); I != N; ++I) { - switch (Path[I].Kind) { - case IndirectLocalPathEntry::AddressOf: - case IndirectLocalPathEntry::LValToRVal: - case IndirectLocalPathEntry::LifetimeBoundCall: - case IndirectLocalPathEntry::TemporaryCopy: - case IndirectLocalPathEntry::GslReferenceInit: - case IndirectLocalPathEntry::GslPointerInit: - // These exist primarily to mark the path as not permitting or - // supporting lifetime extension. 
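Note (illustrative): the ShouldExtend case above corresponds to default member initializers, which are not cloned per use, so this code declines to extend and warns instead. A sketch of the pattern it fires on (diagnostic wording approximate):

```cpp
struct D {
  // Would require lifetime extension through the default member initializer.
  const int &Ref = 42;
};

int main() {
  D X{};   // Clang warns that this lifetime extension is not supported;
  (void)X; // reading X.Ref afterwards would be undefined behavior.
}
```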
- break; - - case IndirectLocalPathEntry::VarInit: - if (cast(Path[I].D)->isImplicit()) - return SourceRange(); - [[fallthrough]]; - case IndirectLocalPathEntry::DefaultInit: - return Path[I].E->getSourceRange(); - - case IndirectLocalPathEntry::LambdaCaptureInit: - if (!Path[I].Capture->capturesVariable()) - continue; - return Path[I].E->getSourceRange(); - } - } - return E->getSourceRange(); -} - -static bool pathOnlyInitializesGslPointer(IndirectLocalPath &Path) { - for (const auto &It : llvm::reverse(Path)) { - if (It.Kind == IndirectLocalPathEntry::VarInit) - continue; - if (It.Kind == IndirectLocalPathEntry::AddressOf) - continue; - if (It.Kind == IndirectLocalPathEntry::LifetimeBoundCall) - continue; - return It.Kind == IndirectLocalPathEntry::GslPointerInit || - It.Kind == IndirectLocalPathEntry::GslReferenceInit; - } - return false; -} - void Sema::checkInitializerLifetime(const InitializedEntity &Entity, Expr *Init) { - LifetimeResult LR = getEntityLifetime(&Entity); - LifetimeKind LK = LR.getInt(); - const InitializedEntity *ExtendingEntity = LR.getPointer(); - - // If this entity doesn't have an interesting lifetime, don't bother looking - // for temporaries within its initializer. - if (LK == LK_FullExpression) - return; - - auto TemporaryVisitor = [&](IndirectLocalPath &Path, Local L, - ReferenceKind RK) -> bool { - SourceRange DiagRange = nextPathEntryRange(Path, 0, L); - SourceLocation DiagLoc = DiagRange.getBegin(); - - auto *MTE = dyn_cast(L); - - bool IsGslPtrInitWithGslTempOwner = false; - bool IsLocalGslOwner = false; - if (pathOnlyInitializesGslPointer(Path)) { - if (isa(L)) { - // We do not want to follow the references when returning a pointer originating - // from a local owner to avoid the following false positive: - // int &p = *localUniquePtr; - // someContainer.add(std::move(localUniquePtr)); - // return p; - IsLocalGslOwner = isRecordWithAttr(L->getType()); - if (pathContainsInit(Path) || !IsLocalGslOwner) - return false; - } else { - IsGslPtrInitWithGslTempOwner = MTE && !MTE->getExtendingDecl() && - isRecordWithAttr(MTE->getType()); - // Skipping a chain of initializing gsl::Pointer annotated objects. - // We are looking only for the final source to find out if it was - // a local or temporary owner or the address of a local variable/param. - if (!IsGslPtrInitWithGslTempOwner) - return true; - } - } - - switch (LK) { - case LK_FullExpression: - llvm_unreachable("already handled this"); - - case LK_Extended: { - if (!MTE) { - // The initialized entity has lifetime beyond the full-expression, - // and the local entity does too, so don't warn. - // - // FIXME: We should consider warning if a static / thread storage - // duration variable retains an automatic storage duration local. - return false; - } - - if (IsGslPtrInitWithGslTempOwner && DiagLoc.isValid()) { - Diag(DiagLoc, diag::warn_dangling_lifetime_pointer) << DiagRange; - return false; - } - - switch (shouldLifetimeExtendThroughPath(Path)) { - case PathLifetimeKind::Extend: - // Update the storage duration of the materialized temporary. - // FIXME: Rebuild the expression instead of mutating it. - MTE->setExtendingDecl(ExtendingEntity->getDecl(), - ExtendingEntity->allocateManglingNumber()); - // Also visit the temporaries lifetime-extended by this initializer. - return true; - - case PathLifetimeKind::ShouldExtend: - // We're supposed to lifetime-extend the temporary along this path (per - // the resolution of DR1815), but we don't support that yet. - // - // FIXME: Properly handle this situation. 
Perhaps the easiest approach - // would be to clone the initializer expression on each use that would - // lifetime extend its temporaries. - Diag(DiagLoc, diag::warn_unsupported_lifetime_extension) - << RK << DiagRange; - break; - - case PathLifetimeKind::NoExtend: - // If the path goes through the initialization of a variable or field, - // it can't possibly reach a temporary created in this full-expression. - // We will have already diagnosed any problems with the initializer. - if (pathContainsInit(Path)) - return false; - - Diag(DiagLoc, diag::warn_dangling_variable) - << RK << !Entity.getParent() - << ExtendingEntity->getDecl()->isImplicit() - << ExtendingEntity->getDecl() << Init->isGLValue() << DiagRange; - break; - } - break; - } - - case LK_MemInitializer: { - if (isa(L)) { - // Under C++ DR1696, if a mem-initializer (or a default member - // initializer used by the absence of one) would lifetime-extend a - // temporary, the program is ill-formed. - if (auto *ExtendingDecl = - ExtendingEntity ? ExtendingEntity->getDecl() : nullptr) { - if (IsGslPtrInitWithGslTempOwner) { - Diag(DiagLoc, diag::warn_dangling_lifetime_pointer_member) - << ExtendingDecl << DiagRange; - Diag(ExtendingDecl->getLocation(), - diag::note_ref_or_ptr_member_declared_here) - << true; - return false; - } - bool IsSubobjectMember = ExtendingEntity != &Entity; - Diag(DiagLoc, shouldLifetimeExtendThroughPath(Path) != - PathLifetimeKind::NoExtend - ? diag::err_dangling_member - : diag::warn_dangling_member) - << ExtendingDecl << IsSubobjectMember << RK << DiagRange; - // Don't bother adding a note pointing to the field if we're inside - // its default member initializer; our primary diagnostic points to - // the same place in that case. - if (Path.empty() || - Path.back().Kind != IndirectLocalPathEntry::DefaultInit) { - Diag(ExtendingDecl->getLocation(), - diag::note_lifetime_extending_member_declared_here) - << RK << IsSubobjectMember; - } - } else { - // We have a mem-initializer but no particular field within it; this - // is either a base class or a delegating initializer directly - // initializing the base-class from something that doesn't live long - // enough. - // - // FIXME: Warn on this. - return false; - } - } else { - // Paths via a default initializer can only occur during error recovery - // (there's no other way that a default initializer can refer to a - // local). Don't produce a bogus warning on those cases. - if (pathContainsInit(Path)) - return false; - - // Suppress false positives for code like the one below: - // Ctor(unique_ptr up) : member(*up), member2(move(up)) {} - if (IsLocalGslOwner && pathOnlyInitializesGslPointer(Path)) - return false; - - auto *DRE = dyn_cast(L); - auto *VD = DRE ? dyn_cast(DRE->getDecl()) : nullptr; - if (!VD) { - // A member was initialized to a local block. - // FIXME: Warn on this. - return false; - } - - if (auto *Member = - ExtendingEntity ? ExtendingEntity->getDecl() : nullptr) { - bool IsPointer = !Member->getType()->isReferenceType(); - Diag(DiagLoc, IsPointer ? diag::warn_init_ptr_member_to_parameter_addr - : diag::warn_bind_ref_member_to_parameter) - << Member << VD << isa(VD) << DiagRange; - Diag(Member->getLocation(), - diag::note_ref_or_ptr_member_declared_here) - << (unsigned)IsPointer; - } - } - break; - } - - case LK_New: - if (isa(L)) { - if (IsGslPtrInitWithGslTempOwner) - Diag(DiagLoc, diag::warn_dangling_lifetime_pointer) << DiagRange; - else - Diag(DiagLoc, RK == RK_ReferenceBinding - ? 
diag::warn_new_dangling_reference - : diag::warn_new_dangling_initializer_list) - << !Entity.getParent() << DiagRange; - } else { - // We can't determine if the allocation outlives the local declaration. - return false; - } - break; - - case LK_Return: - case LK_StmtExprResult: - if (auto *DRE = dyn_cast(L)) { - // We can't determine if the local variable outlives the statement - // expression. - if (LK == LK_StmtExprResult) - return false; - Diag(DiagLoc, diag::warn_ret_stack_addr_ref) - << Entity.getType()->isReferenceType() << DRE->getDecl() - << isa(DRE->getDecl()) << DiagRange; - } else if (isa(L)) { - Diag(DiagLoc, diag::err_ret_local_block) << DiagRange; - } else if (isa(L)) { - // Don't warn when returning a label from a statement expression. - // Leaving the scope doesn't end its lifetime. - if (LK == LK_StmtExprResult) - return false; - Diag(DiagLoc, diag::warn_ret_addr_label) << DiagRange; - } else if (auto *CLE = dyn_cast(L)) { - Diag(DiagLoc, diag::warn_ret_stack_addr_ref) - << Entity.getType()->isReferenceType() << CLE->getInitializer() << 2 - << DiagRange; - } else { - // P2748R5: Disallow Binding a Returned Glvalue to a Temporary. - // [stmt.return]/p6: In a function whose return type is a reference, - // other than an invented function for std::is_convertible ([meta.rel]), - // a return statement that binds the returned reference to a temporary - // expression ([class.temporary]) is ill-formed. - if (getLangOpts().CPlusPlus26 && Entity.getType()->isReferenceType()) - Diag(DiagLoc, diag::err_ret_local_temp_ref) - << Entity.getType()->isReferenceType() << DiagRange; - else - Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref) - << Entity.getType()->isReferenceType() << DiagRange; - } - break; - } - - for (unsigned I = 0; I != Path.size(); ++I) { - auto Elem = Path[I]; - - switch (Elem.Kind) { - case IndirectLocalPathEntry::AddressOf: - case IndirectLocalPathEntry::LValToRVal: - // These exist primarily to mark the path as not permitting or - // supporting lifetime extension. - break; - - case IndirectLocalPathEntry::LifetimeBoundCall: - case IndirectLocalPathEntry::TemporaryCopy: - case IndirectLocalPathEntry::GslPointerInit: - case IndirectLocalPathEntry::GslReferenceInit: - // FIXME: Consider adding a note for these. - break; - - case IndirectLocalPathEntry::DefaultInit: { - auto *FD = cast(Elem.D); - Diag(FD->getLocation(), diag::note_init_with_default_member_initializer) - << FD << nextPathEntryRange(Path, I + 1, L); - break; - } - - case IndirectLocalPathEntry::VarInit: { - const VarDecl *VD = cast(Elem.D); - Diag(VD->getLocation(), diag::note_local_var_initializer) - << VD->getType()->isReferenceType() - << VD->isImplicit() << VD->getDeclName() - << nextPathEntryRange(Path, I + 1, L); - break; - } - - case IndirectLocalPathEntry::LambdaCaptureInit: - if (!Elem.Capture->capturesVariable()) - break; - // FIXME: We can't easily tell apart an init-capture from a nested - // capture of an init-capture. - const ValueDecl *VD = Elem.Capture->getCapturedVar(); - Diag(Elem.Capture->getLocation(), diag::note_lambda_capture_initializer) - << VD << VD->isInitCapture() << Elem.Capture->isExplicit() - << (Elem.Capture->getCaptureKind() == LCK_ByRef) << VD - << nextPathEntryRange(Path, I + 1, L); - break; - } - } - - // We didn't lifetime-extend, so don't go any further; we don't need more - // warnings or errors on inner temporaries within this one's initializer. 
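Note (illustrative): the P2748R5 branch above is the C++26 tightening of an existing warning. A function that hits it, an error in C++26 mode and a warning in earlier modes:

```cpp
// In a function returning a reference, binding the returned reference to a
// temporary is ill-formed per [stmt.return]/6 as amended by P2748R5.
const int &dangling() { return 42; }

int main() {}
```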
- return false; - }; - - bool EnableLifetimeWarnings = !getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); - llvm::SmallVector Path; - if (Init->isGLValue()) - visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding, - TemporaryVisitor, - EnableLifetimeWarnings); - else - visitLocalsRetainedByInitializer(Path, Init, TemporaryVisitor, false, - EnableLifetimeWarnings); + return sema::checkExprLifetime(*this, Entity, Init); } static void DiagnoseNarrowingInInitList(Sema &S, diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index e9476a0c93c5d4..ca9c7cb9faadfb 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -2391,7 +2391,7 @@ Sema::LambdaScopeForCallOperatorInstantiationRAII:: Pattern = dyn_cast(getLambdaAwareParentOfDeclContext(Pattern)); - if (!FD || !Pattern) + if (!ParentFD || !Pattern) break; SemaRef.addInstantiatedParametersToScope(ParentFD, Pattern, Scope, MLTAL); diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index be6ea20a956a39..9a3fabc5933eee 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -1606,22 +1606,32 @@ bool Sema::isUsableModule(const Module *M) { // [module.global.frag]p1: // The global module fragment can be used to provide declarations that are // attached to the global module and usable within the module unit. - if (M == TheGlobalModuleFragment || M == TheImplicitGlobalModuleFragment || - // If M is the module we're parsing, it should be usable. This covers the - // private module fragment. The private module fragment is usable only if - // it is within the current module unit. And it must be the current - // parsing module unit if it is within the current module unit according - // to the grammar of the private module fragment. NOTE: This is covered by - // the following condition. The intention of the check is to avoid string - // comparison as much as possible. - M == getCurrentModule() || - // The module unit which is in the same module with the current module - // unit is usable. - // - // FIXME: Here we judge if they are in the same module by comparing the - // string. Is there any better solution? - M->getPrimaryModuleInterfaceName() == - llvm::StringRef(getLangOpts().CurrentModule).split(':').first) { + if (M == TheGlobalModuleFragment || M == TheImplicitGlobalModuleFragment) { + UsableModuleUnitsCache.insert(M); + return true; + } + + // Otherwise, the global module fragment from other translation unit is not + // directly usable. + if (M->isGlobalModule()) + return false; + + Module *Current = getCurrentModule(); + + // If we're not parsing a module, we can't use all the declarations from + // another module easily. + if (!Current) + return false; + + // If M is the module we're parsing or M and the current module unit lives in + // the same module, M should be usable. + // + // Note: It should be fine to search the vector `ModuleScopes` linearly since + // it should be generally small enough. There should be rare module fragments + // in a named module unit. 
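Note (illustrative): the rewritten isUsableModule logic above boils down to three usable cases: the global module fragment, the unit currently being compiled, and any other unit of the same named module. A hypothetical two-file layout showing the same-module case (file and module names are made up):

```cpp
// m.cppm -- primary module interface unit of module 'm'
export module m;
export int answer();

// m_impl.cpp -- implementation unit of the same module; it implicitly imports
// the interface, and declarations attached to 'm' are usable here, unlike
// declarations owned by an unrelated module.
module m;
int answer() { return 42; }
```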
+ if (llvm::count_if(ModuleScopes, + [&M](const ModuleScope &MS) { return MS.Module == M; }) || + getASTContext().isInSameModule(M, Current)) { UsableModuleUnitsCache.insert(M); return true; } @@ -5816,19 +5826,13 @@ void Sema::diagnoseMissingImport(SourceLocation UseLoc, const NamedDecl *Decl, if (M->isModuleMapModule()) return M->getFullModuleName(); - Module *CurrentModule = getCurrentModule(); - if (M->isImplicitGlobalModule()) M = M->getTopLevelModule(); - bool IsInTheSameModule = - CurrentModule && CurrentModule->getPrimaryModuleInterfaceName() == - M->getPrimaryModuleInterfaceName(); - // If the current module unit is in the same module with M, it is OK to show // the partition name. Otherwise, it'll be sufficient to show the primary // module name. - if (IsInTheSameModule) + if (getASTContext().isInSameModule(M, getCurrentModule())) return M->getTopLevelModuleName().str(); else return M->getPrimaryModuleInterfaceName().str(); @@ -5961,7 +5965,7 @@ RedeclarationKind Sema::forRedeclarationInCurContext() const { // anything that is not visible. We don't need to check linkage here; if // the context has internal linkage, redeclaration lookup won't find things // from other TUs, and we can't safely compute linkage yet in general. - if (cast(CurContext)->getOwningModuleForLinkage(/*IgnoreLinkage*/ true)) + if (cast(CurContext)->getOwningModuleForLinkage()) return RedeclarationKind::ForVisibleRedeclaration; return RedeclarationKind::ForExternalRedeclaration; } diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index ad118ac90e4aa6..98e7971dc0bf3f 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -82,7 +82,8 @@ static std::string stringFromPath(ModuleIdPath Path) { /// CurrentModule. Since currently it is expensive to decide whether two module /// units come from the same module by comparing the module name. static bool -isImportingModuleUnitFromSameModule(Module *Imported, Module *CurrentModule, +isImportingModuleUnitFromSameModule(ASTContext &Ctx, Module *Imported, + Module *CurrentModule, Module *&FoundPrimaryModuleInterface) { if (!Imported->isNamedModule()) return false; @@ -109,8 +110,7 @@ isImportingModuleUnitFromSameModule(Module *Imported, Module *CurrentModule, if (!CurrentModule->isModulePartitionImplementation()) return false; - if (Imported->getPrimaryModuleInterfaceName() == - CurrentModule->getPrimaryModuleInterfaceName()) { + if (Ctx.isInSameModule(Imported, CurrentModule)) { assert(!FoundPrimaryModuleInterface || FoundPrimaryModuleInterface == Imported); FoundPrimaryModuleInterface = Imported; @@ -127,8 +127,9 @@ isImportingModuleUnitFromSameModule(Module *Imported, Module *CurrentModule, /// the module unit purview of U. These rules can in turn lead to the /// importation of yet more translation units. static void -makeTransitiveImportsVisible(VisibleModuleSet &VisibleModules, Module *Imported, - Module *CurrentModule, SourceLocation ImportLoc, +makeTransitiveImportsVisible(ASTContext &Ctx, VisibleModuleSet &VisibleModules, + Module *Imported, Module *CurrentModule, + SourceLocation ImportLoc, bool IsImportingPrimaryModuleInterface = false) { assert(Imported->isNamedModule() && "'makeTransitiveImportsVisible()' is intended for standard C++ named " @@ -150,7 +151,7 @@ makeTransitiveImportsVisible(VisibleModuleSet &VisibleModules, Module *Imported, // use the sourcelocation loaded from the visible modules. 
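Note (illustrative): makeTransitiveImportsVisible implements the [module.import] rule referenced above, where importing another unit of the module being built also brings in that unit's own imports. A sketch of the effect (module and file names are hypothetical):

```cpp
// m_part.cppm -- a partition of module 'm'
export module m:part;
import other.lib;   // hypothetical non-exported import

// m.cppm -- primary interface of 'm': importing :part, a unit of the same
// module, also makes other.lib's declarations visible here.
export module m;
import :part;
```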
VisibleModules.setVisible(Importing, ImportLoc); - if (isImportingModuleUnitFromSameModule(Importing, CurrentModule, + if (isImportingModuleUnitFromSameModule(Ctx, Importing, CurrentModule, FoundPrimaryModuleInterface)) for (Module *TransImported : Importing->Imports) if (!VisibleModules.isVisible(TransImported)) @@ -484,7 +485,8 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // and return the import decl to be added to the current TU. if (Interface) { - makeTransitiveImportsVisible(VisibleModules, Interface, Mod, ModuleLoc, + makeTransitiveImportsVisible(getASTContext(), VisibleModules, Interface, + Mod, ModuleLoc, /*IsImportingPrimaryModuleInterface=*/true); // Make the import decl for the interface in the impl module. @@ -643,8 +645,8 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, Diag(ImportLoc, diag::warn_experimental_header_unit); if (Mod->isNamedModule()) - makeTransitiveImportsVisible(VisibleModules, Mod, getCurrentModule(), - ImportLoc); + makeTransitiveImportsVisible(getASTContext(), VisibleModules, Mod, + getCurrentModule(), ImportLoc); else VisibleModules.setVisible(Mod, ImportLoc); diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp index d396258cfc7d18..75233689769c5b 100644 --- a/clang/lib/Sema/SemaObjC.cpp +++ b/clang/lib/Sema/SemaObjC.cpp @@ -2255,4 +2255,154 @@ void SemaObjC::handleExternallyRetainedAttr(Decl *D, const ParsedAttr &AL) { handleSimpleAttribute(*this, D, AL); } +bool SemaObjC::GetFormatNSStringIdx(const FormatAttr *Format, unsigned &Idx) { + Sema::FormatStringInfo FSI; + if ((SemaRef.GetFormatStringType(Format) == Sema::FST_NSString) && + SemaRef.getFormatStringInfo(Format, false, true, &FSI)) { + Idx = FSI.FormatIdx; + return true; + } + return false; +} + +/// Diagnose use of %s directive in an NSString which is being passed +/// as formatting string to formatting method. 
+void SemaObjC::DiagnoseCStringFormatDirectiveInCFAPI(const NamedDecl *FDecl, + Expr **Args, + unsigned NumArgs) { + unsigned Idx = 0; + bool Format = false; + ObjCStringFormatFamily SFFamily = FDecl->getObjCFStringFormattingFamily(); + if (SFFamily == ObjCStringFormatFamily::SFF_CFString) { + Idx = 2; + Format = true; + } else + for (const auto *I : FDecl->specific_attrs()) { + if (GetFormatNSStringIdx(I, Idx)) { + Format = true; + break; + } + } + if (!Format || NumArgs <= Idx) + return; + const Expr *FormatExpr = Args[Idx]; + if (const CStyleCastExpr *CSCE = dyn_cast(FormatExpr)) + FormatExpr = CSCE->getSubExpr(); + const StringLiteral *FormatString; + if (const ObjCStringLiteral *OSL = + dyn_cast(FormatExpr->IgnoreParenImpCasts())) + FormatString = OSL->getString(); + else + FormatString = dyn_cast(FormatExpr->IgnoreParenImpCasts()); + if (!FormatString) + return; + if (SemaRef.FormatStringHasSArg(FormatString)) { + Diag(FormatExpr->getExprLoc(), diag::warn_objc_cdirective_format_string) + << "%s" << 1 << 1; + Diag(FDecl->getLocation(), diag::note_entity_declared_at) + << FDecl->getDeclName(); + } +} + +bool SemaObjC::isSignedCharBool(QualType Ty) { + return Ty->isSpecificBuiltinType(BuiltinType::SChar) && getLangOpts().ObjC && + NSAPIObj->isObjCBOOLType(Ty); +} + +void SemaObjC::adornBoolConversionDiagWithTernaryFixit( + Expr *SourceExpr, const Sema::SemaDiagnosticBuilder &Builder) { + Expr *Ignored = SourceExpr->IgnoreImplicit(); + if (const auto *OVE = dyn_cast(Ignored)) + Ignored = OVE->getSourceExpr(); + bool NeedsParens = isa(Ignored) || + isa(Ignored) || + isa(Ignored); + SourceLocation EndLoc = SemaRef.getLocForEndOfToken(SourceExpr->getEndLoc()); + if (NeedsParens) + Builder << FixItHint::CreateInsertion(SourceExpr->getBeginLoc(), "(") + << FixItHint::CreateInsertion(EndLoc, ")"); + Builder << FixItHint::CreateInsertion(EndLoc, " ? YES : NO"); +} + +/// Check a single element within a collection literal against the +/// target element type. +static void checkCollectionLiteralElement(Sema &S, QualType TargetElementType, + Expr *Element, unsigned ElementKind) { + // Skip a bitcast to 'id' or qualified 'id'. + if (auto ICE = dyn_cast(Element)) { + if (ICE->getCastKind() == CK_BitCast && + ICE->getSubExpr()->getType()->getAs()) + Element = ICE->getSubExpr(); + } + + QualType ElementType = Element->getType(); + ExprResult ElementResult(Element); + if (ElementType->getAs() && + S.CheckSingleAssignmentConstraints(TargetElementType, ElementResult, + false, false) != Sema::Compatible) { + S.Diag(Element->getBeginLoc(), diag::warn_objc_collection_literal_element) + << ElementType << ElementKind << TargetElementType + << Element->getSourceRange(); + } + + if (auto ArrayLiteral = dyn_cast(Element)) + S.ObjC().checkArrayLiteral(TargetElementType, ArrayLiteral); + else if (auto DictionaryLiteral = dyn_cast(Element)) + S.ObjC().checkDictionaryLiteral(TargetElementType, DictionaryLiteral); +} + +/// Check an Objective-C array literal being converted to the given +/// target type. 
+void SemaObjC::checkArrayLiteral(QualType TargetType, + ObjCArrayLiteral *ArrayLiteral) { + if (!NSArrayDecl) + return; + + const auto *TargetObjCPtr = TargetType->getAs(); + if (!TargetObjCPtr) + return; + + if (TargetObjCPtr->isUnspecialized() || + TargetObjCPtr->getInterfaceDecl()->getCanonicalDecl() != + NSArrayDecl->getCanonicalDecl()) + return; + + auto TypeArgs = TargetObjCPtr->getTypeArgs(); + if (TypeArgs.size() != 1) + return; + + QualType TargetElementType = TypeArgs[0]; + for (unsigned I = 0, N = ArrayLiteral->getNumElements(); I != N; ++I) { + checkCollectionLiteralElement(SemaRef, TargetElementType, + ArrayLiteral->getElement(I), 0); + } +} + +void SemaObjC::checkDictionaryLiteral( + QualType TargetType, ObjCDictionaryLiteral *DictionaryLiteral) { + if (!NSDictionaryDecl) + return; + + const auto *TargetObjCPtr = TargetType->getAs(); + if (!TargetObjCPtr) + return; + + if (TargetObjCPtr->isUnspecialized() || + TargetObjCPtr->getInterfaceDecl()->getCanonicalDecl() != + NSDictionaryDecl->getCanonicalDecl()) + return; + + auto TypeArgs = TargetObjCPtr->getTypeArgs(); + if (TypeArgs.size() != 2) + return; + + QualType TargetKeyType = TypeArgs[0]; + QualType TargetObjectType = TypeArgs[1]; + for (unsigned I = 0, N = DictionaryLiteral->getNumElements(); I != N; ++I) { + auto Element = DictionaryLiteral->getKeyValueElement(I); + checkCollectionLiteralElement(SemaRef, TargetKeyType, Element.Key, 1); + checkCollectionLiteralElement(SemaRef, TargetObjectType, Element.Value, 2); + } +} + } // namespace clang diff --git a/clang/lib/Sema/SemaOpenCL.cpp b/clang/lib/Sema/SemaOpenCL.cpp index b3b495a15e02c3..9f746fffd34d0a 100644 --- a/clang/lib/Sema/SemaOpenCL.cpp +++ b/clang/lib/Sema/SemaOpenCL.cpp @@ -96,4 +96,483 @@ void SemaOpenCL::handleSubGroupSize(Decl *D, const ParsedAttr &AL) { OpenCLIntelReqdSubGroupSizeAttr(getASTContext(), AL, SGSize)); } +static inline bool isBlockPointer(Expr *Arg) { + return Arg->getType()->isBlockPointerType(); +} + +/// OpenCL C v2.0, s6.13.17.2 - Checks that the block parameters are all local +/// void*, which is a requirement of device side enqueue. +static bool checkBlockArgs(Sema &S, Expr *BlockArg) { + const BlockPointerType *BPT = + cast(BlockArg->getType().getCanonicalType()); + ArrayRef Params = + BPT->getPointeeType()->castAs()->getParamTypes(); + unsigned ArgCounter = 0; + bool IllegalParams = false; + // Iterate through the block parameters until either one is found that is not + // a local void*, or the block is valid. + for (ArrayRef::iterator I = Params.begin(), E = Params.end(); + I != E; ++I, ++ArgCounter) { + if (!(*I)->isPointerType() || !(*I)->getPointeeType()->isVoidType() || + (*I)->getPointeeType().getQualifiers().getAddressSpace() != + LangAS::opencl_local) { + // Get the location of the error. If a block literal has been passed + // (BlockExpr) then we can point straight to the offending argument, + // else we just point to the variable reference. 
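Note (illustrative): checkBlockArgs enforces the device-side-enqueue rule described above, namely that every parameter of the block is a pointer to void in the __local address space. A standalone C++ sketch of just that predicate, using made-up types rather than Clang's AST:

```cpp
#include <vector>

enum class AddrSpace { Private, Local, Global, Constant };

struct ParamType {
  bool IsPointer = false;
  bool PointeeIsVoid = false;
  AddrSpace AS = AddrSpace::Private;
};

// Mirrors the loop above: any parameter that is not a 'local void *' makes
// the block illegal for enqueue_kernel, and Sema would diagnose it.
static bool hasIllegalEnqueueParams(const std::vector<ParamType> &Params) {
  for (const ParamType &P : Params)
    if (!P.IsPointer || !P.PointeeIsVoid || P.AS != AddrSpace::Local)
      return true;
  return false;
}

int main() {
  std::vector<ParamType> Ok = {{true, true, AddrSpace::Local}};
  std::vector<ParamType> Bad = {{true, false, AddrSpace::Local}};
  return (hasIllegalEnqueueParams(Ok) || !hasIllegalEnqueueParams(Bad)) ? 1 : 0;
}
```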
+ SourceLocation ErrorLoc; + if (isa(BlockArg)) { + BlockDecl *BD = cast(BlockArg)->getBlockDecl(); + ErrorLoc = BD->getParamDecl(ArgCounter)->getBeginLoc(); + } else if (isa(BlockArg)) { + ErrorLoc = cast(BlockArg)->getBeginLoc(); + } + S.Diag(ErrorLoc, + diag::err_opencl_enqueue_kernel_blocks_non_local_void_args); + IllegalParams = true; + } + } + + return IllegalParams; +} + +bool SemaOpenCL::checkSubgroupExt(CallExpr *Call) { + // OpenCL device can support extension but not the feature as extension + // requires subgroup independent forward progress, but subgroup independent + // forward progress is optional in OpenCL C 3.0 __opencl_c_subgroups feature. + if (!SemaRef.getOpenCLOptions().isSupported("cl_khr_subgroups", + getLangOpts()) && + !SemaRef.getOpenCLOptions().isSupported("__opencl_c_subgroups", + getLangOpts())) { + Diag(Call->getBeginLoc(), diag::err_opencl_requires_extension) + << 1 << Call->getDirectCallee() + << "cl_khr_subgroups or __opencl_c_subgroups"; + return true; + } + return false; +} + +bool SemaOpenCL::checkBuiltinNDRangeAndBlock(CallExpr *TheCall) { + if (SemaRef.checkArgCount(TheCall, 2)) + return true; + + if (checkSubgroupExt(TheCall)) + return true; + + // First argument is an ndrange_t type. + Expr *NDRangeArg = TheCall->getArg(0); + if (NDRangeArg->getType().getUnqualifiedType().getAsString() != "ndrange_t") { + Diag(NDRangeArg->getBeginLoc(), diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "'ndrange_t'"; + return true; + } + + Expr *BlockArg = TheCall->getArg(1); + if (!isBlockPointer(BlockArg)) { + Diag(BlockArg->getBeginLoc(), diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "block"; + return true; + } + return checkBlockArgs(SemaRef, BlockArg); +} + +bool SemaOpenCL::checkBuiltinKernelWorkGroupSize(CallExpr *TheCall) { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + + Expr *BlockArg = TheCall->getArg(0); + if (!isBlockPointer(BlockArg)) { + Diag(BlockArg->getBeginLoc(), diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "block"; + return true; + } + return checkBlockArgs(SemaRef, BlockArg); +} + +/// Diagnose integer type and any valid implicit conversion to it. +static bool checkOpenCLEnqueueIntType(Sema &S, Expr *E, const QualType &IntT) { + // Taking into account implicit conversions, + // allow any integer. + if (!E->getType()->isIntegerType()) { + S.Diag(E->getBeginLoc(), + diag::err_opencl_enqueue_kernel_invalid_local_size_type); + return true; + } + // Potentially emit standard warnings for implicit conversions if enabled + // using -Wconversion. + S.CheckImplicitConversion(E, IntT, E->getBeginLoc()); + return false; +} + +static bool checkOpenCLEnqueueLocalSizeArgs(Sema &S, CallExpr *TheCall, + unsigned Start, unsigned End) { + bool IllegalParams = false; + for (unsigned I = Start; I <= End; ++I) + IllegalParams |= checkOpenCLEnqueueIntType(S, TheCall->getArg(I), + S.Context.getSizeType()); + return IllegalParams; +} + +/// OpenCL v2.0, s6.13.17.1 - Check that sizes are provided for all +/// 'local void*' parameter of passed block. 
+static bool checkOpenCLEnqueueVariadicArgs(Sema &S, CallExpr *TheCall, + Expr *BlockArg, + unsigned NumNonVarArgs) { + const BlockPointerType *BPT = + cast(BlockArg->getType().getCanonicalType()); + unsigned NumBlockParams = + BPT->getPointeeType()->castAs()->getNumParams(); + unsigned TotalNumArgs = TheCall->getNumArgs(); + + // For each argument passed to the block, a corresponding uint needs to + // be passed to describe the size of the local memory. + if (TotalNumArgs != NumBlockParams + NumNonVarArgs) { + S.Diag(TheCall->getBeginLoc(), + diag::err_opencl_enqueue_kernel_local_size_args); + return true; + } + + // Check that the sizes of the local memory are specified by integers. + return checkOpenCLEnqueueLocalSizeArgs(S, TheCall, NumNonVarArgs, + TotalNumArgs - 1); +} + +bool SemaOpenCL::checkBuiltinEnqueueKernel(CallExpr *TheCall) { + ASTContext &Context = getASTContext(); + unsigned NumArgs = TheCall->getNumArgs(); + + if (NumArgs < 4) { + Diag(TheCall->getBeginLoc(), diag::err_typecheck_call_too_few_args_at_least) + << 0 << 4 << NumArgs << /*is non object*/ 0; + return true; + } + + Expr *Arg0 = TheCall->getArg(0); + Expr *Arg1 = TheCall->getArg(1); + Expr *Arg2 = TheCall->getArg(2); + Expr *Arg3 = TheCall->getArg(3); + + // First argument always needs to be a queue_t type. + if (!Arg0->getType()->isQueueT()) { + Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << getASTContext().OCLQueueTy; + return true; + } + + // Second argument always needs to be a kernel_enqueue_flags_t enum value. + if (!Arg1->getType()->isIntegerType()) { + Diag(TheCall->getArg(1)->getBeginLoc(), + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "'kernel_enqueue_flags_t' (i.e. uint)"; + return true; + } + + // Third argument is always an ndrange_t type. + if (Arg2->getType().getUnqualifiedType().getAsString() != "ndrange_t") { + Diag(TheCall->getArg(2)->getBeginLoc(), + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "'ndrange_t'"; + return true; + } + + // With four arguments, there is only one form that the function could be + // called in: no events and no variable arguments. + if (NumArgs == 4) { + // check that the last argument is the right block type. + if (!isBlockPointer(Arg3)) { + Diag(Arg3->getBeginLoc(), diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "block"; + return true; + } + // we have a block type, check the prototype + const BlockPointerType *BPT = + cast(Arg3->getType().getCanonicalType()); + if (BPT->getPointeeType()->castAs()->getNumParams() > + 0) { + Diag(Arg3->getBeginLoc(), diag::err_opencl_enqueue_kernel_blocks_no_args); + return true; + } + return false; + } + // we can have block + varargs. + if (isBlockPointer(Arg3)) + return (checkBlockArgs(SemaRef, Arg3) || + checkOpenCLEnqueueVariadicArgs(SemaRef, TheCall, Arg3, 4)); + // last two cases with either exactly 7 args or 7 args and varargs. + if (NumArgs >= 7) { + // check common block argument. + Expr *Arg6 = TheCall->getArg(6); + if (!isBlockPointer(Arg6)) { + Diag(Arg6->getBeginLoc(), diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "block"; + return true; + } + if (checkBlockArgs(SemaRef, Arg6)) + return true; + + // Forth argument has to be any integer type. 
+ if (!Arg3->getType()->isIntegerType()) { + Diag(TheCall->getArg(3)->getBeginLoc(), + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "integer"; + return true; + } + // check remaining common arguments. + Expr *Arg4 = TheCall->getArg(4); + Expr *Arg5 = TheCall->getArg(5); + + // Fifth argument is always passed as a pointer to clk_event_t. + if (!Arg4->isNullPointerConstant(Context, + Expr::NPC_ValueDependentIsNotNull) && + !Arg4->getType()->getPointeeOrArrayElementType()->isClkEventT()) { + Diag(TheCall->getArg(4)->getBeginLoc(), + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() + << Context.getPointerType(Context.OCLClkEventTy); + return true; + } + + // Sixth argument is always passed as a pointer to clk_event_t. + if (!Arg5->isNullPointerConstant(Context, + Expr::NPC_ValueDependentIsNotNull) && + !(Arg5->getType()->isPointerType() && + Arg5->getType()->getPointeeType()->isClkEventT())) { + Diag(TheCall->getArg(5)->getBeginLoc(), + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() + << Context.getPointerType(Context.OCLClkEventTy); + return true; + } + + if (NumArgs == 7) + return false; + + return checkOpenCLEnqueueVariadicArgs(SemaRef, TheCall, Arg6, 7); + } + + // None of the specific case has been detected, give generic error + Diag(TheCall->getBeginLoc(), diag::err_opencl_enqueue_kernel_incorrect_args); + return true; +} + +/// Returns OpenCL access qual. +static OpenCLAccessAttr *getOpenCLArgAccess(const Decl *D) { + return D->getAttr(); +} + +/// Returns true if pipe element type is different from the pointer. +static bool checkPipeArg(Sema &S, CallExpr *Call) { + const Expr *Arg0 = Call->getArg(0); + // First argument type should always be pipe. + if (!Arg0->getType()->isPipeType()) { + S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_first_arg) + << Call->getDirectCallee() << Arg0->getSourceRange(); + return true; + } + OpenCLAccessAttr *AccessQual = + getOpenCLArgAccess(cast(Arg0)->getDecl()); + // Validates the access qualifier is compatible with the call. + // OpenCL v2.0 s6.13.16 - The access qualifiers for pipe should only be + // read_only and write_only, and assumed to be read_only if no qualifier is + // specified. + switch (Call->getDirectCallee()->getBuiltinID()) { + case Builtin::BIread_pipe: + case Builtin::BIreserve_read_pipe: + case Builtin::BIcommit_read_pipe: + case Builtin::BIwork_group_reserve_read_pipe: + case Builtin::BIsub_group_reserve_read_pipe: + case Builtin::BIwork_group_commit_read_pipe: + case Builtin::BIsub_group_commit_read_pipe: + if (!(!AccessQual || AccessQual->isReadOnly())) { + S.Diag(Arg0->getBeginLoc(), + diag::err_opencl_builtin_pipe_invalid_access_modifier) + << "read_only" << Arg0->getSourceRange(); + return true; + } + break; + case Builtin::BIwrite_pipe: + case Builtin::BIreserve_write_pipe: + case Builtin::BIcommit_write_pipe: + case Builtin::BIwork_group_reserve_write_pipe: + case Builtin::BIsub_group_reserve_write_pipe: + case Builtin::BIwork_group_commit_write_pipe: + case Builtin::BIsub_group_commit_write_pipe: + if (!(AccessQual && AccessQual->isWriteOnly())) { + S.Diag(Arg0->getBeginLoc(), + diag::err_opencl_builtin_pipe_invalid_access_modifier) + << "write_only" << Arg0->getSourceRange(); + return true; + } + break; + default: + break; + } + return false; +} + +/// Returns true if pipe element type is different from the pointer. 
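A minimal OpenCL C sketch of what the pipe checks above and below enforce (illustrative only, not part of this patch; kernel names are hypothetical): the pipe argument must carry an access qualifier compatible with the built-in, and the packet pointer must have the pipe's element type.

  kernel void producer(write_only pipe int out_pipe, global const int *src) {
    int v = src[get_global_id(0)];
    write_pipe(out_pipe, &v);          // write_only pipe, int * packet
  }

  kernel void consumer(read_only pipe int in_pipe, global int *dst) {
    int v;
    if (read_pipe(in_pipe, &v) == 0)   // read_only pipe, int * packet
      dst[get_global_id(0)] = v;
    // Passing a float * packet here would be rejected with
    // err_opencl_builtin_pipe_invalid_arg.
  }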
+static bool checkPipePacketType(Sema &S, CallExpr *Call, unsigned Idx) { + const Expr *Arg0 = Call->getArg(0); + const Expr *ArgIdx = Call->getArg(Idx); + const PipeType *PipeTy = cast(Arg0->getType()); + const QualType EltTy = PipeTy->getElementType(); + const PointerType *ArgTy = ArgIdx->getType()->getAs(); + // The Idx argument should be a pointer and the type of the pointer and + // the type of pipe element should also be the same. + if (!ArgTy || + !S.Context.hasSameType( + EltTy, ArgTy->getPointeeType()->getCanonicalTypeInternal())) { + S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) + << Call->getDirectCallee() << S.Context.getPointerType(EltTy) + << ArgIdx->getType() << ArgIdx->getSourceRange(); + return true; + } + return false; +} + +bool SemaOpenCL::checkBuiltinRWPipe(CallExpr *Call) { + // OpenCL v2.0 s6.13.16.2 - The built-in read/write + // functions have two forms. + switch (Call->getNumArgs()) { + case 2: + if (checkPipeArg(SemaRef, Call)) + return true; + // The call with 2 arguments should be + // read/write_pipe(pipe T, T*). + // Check packet type T. + if (checkPipePacketType(SemaRef, Call, 1)) + return true; + break; + + case 4: { + if (checkPipeArg(SemaRef, Call)) + return true; + // The call with 4 arguments should be + // read/write_pipe(pipe T, reserve_id_t, uint, T*). + // Check reserve_id_t. + if (!Call->getArg(1)->getType()->isReserveIDT()) { + Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) + << Call->getDirectCallee() << getASTContext().OCLReserveIDTy + << Call->getArg(1)->getType() << Call->getArg(1)->getSourceRange(); + return true; + } + + // Check the index. + const Expr *Arg2 = Call->getArg(2); + if (!Arg2->getType()->isIntegerType() && + !Arg2->getType()->isUnsignedIntegerType()) { + Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) + << Call->getDirectCallee() << getASTContext().UnsignedIntTy + << Arg2->getType() << Arg2->getSourceRange(); + return true; + } + + // Check packet type T. + if (checkPipePacketType(SemaRef, Call, 3)) + return true; + } break; + default: + Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_arg_num) + << Call->getDirectCallee() << Call->getSourceRange(); + return true; + } + + return false; +} + +bool SemaOpenCL::checkBuiltinReserveRWPipe(CallExpr *Call) { + if (SemaRef.checkArgCount(Call, 2)) + return true; + + if (checkPipeArg(SemaRef, Call)) + return true; + + // Check the reserve size. + if (!Call->getArg(1)->getType()->isIntegerType() && + !Call->getArg(1)->getType()->isUnsignedIntegerType()) { + Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) + << Call->getDirectCallee() << getASTContext().UnsignedIntTy + << Call->getArg(1)->getType() << Call->getArg(1)->getSourceRange(); + return true; + } + + // Since return type of reserve_read/write_pipe built-in function is + // reserve_id_t, which is not defined in the builtin def file , we used int + // as return type and need to override the return type of these functions. + Call->setType(getASTContext().OCLReserveIDTy); + + return false; +} + +bool SemaOpenCL::checkBuiltinCommitRWPipe(CallExpr *Call) { + if (SemaRef.checkArgCount(Call, 2)) + return true; + + if (checkPipeArg(SemaRef, Call)) + return true; + + // Check reserve_id_t. 
+ if (!Call->getArg(1)->getType()->isReserveIDT()) { + Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_invalid_arg) + << Call->getDirectCallee() << getASTContext().OCLReserveIDTy + << Call->getArg(1)->getType() << Call->getArg(1)->getSourceRange(); + return true; + } + + return false; +} + +bool SemaOpenCL::checkBuiltinPipePackets(CallExpr *Call) { + if (SemaRef.checkArgCount(Call, 1)) + return true; + + if (!Call->getArg(0)->getType()->isPipeType()) { + Diag(Call->getBeginLoc(), diag::err_opencl_builtin_pipe_first_arg) + << Call->getDirectCallee() << Call->getArg(0)->getSourceRange(); + return true; + } + + return false; +} + +bool SemaOpenCL::checkBuiltinToAddr(unsigned BuiltinID, CallExpr *Call) { + if (SemaRef.checkArgCount(Call, 1)) + return true; + + auto RT = Call->getArg(0)->getType(); + if (!RT->isPointerType() || + RT->getPointeeType().getAddressSpace() == LangAS::opencl_constant) { + Diag(Call->getBeginLoc(), diag::err_opencl_builtin_to_addr_invalid_arg) + << Call->getArg(0) << Call->getDirectCallee() << Call->getSourceRange(); + return true; + } + + if (RT->getPointeeType().getAddressSpace() != LangAS::opencl_generic) { + Diag(Call->getArg(0)->getBeginLoc(), + diag::warn_opencl_generic_address_space_arg) + << Call->getDirectCallee()->getNameInfo().getAsString() + << Call->getArg(0)->getSourceRange(); + } + + RT = RT->getPointeeType(); + auto Qual = RT.getQualifiers(); + switch (BuiltinID) { + case Builtin::BIto_global: + Qual.setAddressSpace(LangAS::opencl_global); + break; + case Builtin::BIto_local: + Qual.setAddressSpace(LangAS::opencl_local); + break; + case Builtin::BIto_private: + Qual.setAddressSpace(LangAS::opencl_private); + break; + default: + llvm_unreachable("Invalid builtin function"); + } + Call->setType(getASTContext().getPointerType( + getASTContext().getQualifiedType(RT.getUnqualifiedType(), Qual))); + + return false; +} + } // namespace clang diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 5c759aedf9798a..3e6c6c94a47cda 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "clang/Sema/SemaOpenMP.h" + #include "TreeTransform.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTMutationListener.h" @@ -466,7 +467,7 @@ class DSAStackTy { getTopOfStack().PossiblyLoopCounter = D ? D->getCanonicalDecl() : D; } /// Gets the possible loop counter decl. - const Decl *getPossiblyLoopCunter() const { + const Decl *getPossiblyLoopCounter() const { return getTopOfStack().PossiblyLoopCounter; } /// Start new OpenMP region stack in new non-capturing function. @@ -718,7 +719,7 @@ class DSAStackTy { TargetLocations.push_back(LocStart); } - /// Add location for the first encountered atomicc directive. + /// Add location for the first encountered atomic directive. 
void addAtomicDirectiveLoc(SourceLocation Loc) { if (AtomicLocation.isInvalid()) AtomicLocation = Loc; @@ -2584,7 +2585,7 @@ OpenMPClauseKind SemaOpenMP::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, DSAStack->loopStart(); return OMPC_private; } - if ((DSAStack->getPossiblyLoopCunter() == D->getCanonicalDecl() || + if ((DSAStack->getPossiblyLoopCounter() == D->getCanonicalDecl() || DSAStack->isLoopControlVariable(D).first) && !DSAStack->hasExplicitDSA( D, [](OpenMPClauseKind K, bool) { return K != OMPC_private; }, @@ -2694,8 +2695,8 @@ bool SemaOpenMP::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, unsigned NumLevels = getOpenMPCaptureLevels(DSAStack->getDirective(Level)); if (Level == 0) - // non-file scope static variale with default(firstprivate) - // should be gloabal captured. + // non-file scope static variable with default(firstprivate) + // should be global captured. return (NumLevels == CaptureLevel + 1 && (TopDVar.CKind != OMPC_shared || DSAStack->getDefaultDSA() == DSA_firstprivate)); @@ -2730,11 +2731,11 @@ void SemaOpenMP::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, assert(getLangOpts().OpenMP && "Expected OpenMP compilation mode."); std::optional DevTy = OMPDeclareTargetDeclAttr::getDeviceType(Caller->getMostRecentDecl()); - // Ignore host functions during device analyzis. + // Ignore host functions during device analysis. if (getLangOpts().OpenMPIsTargetDevice && (!DevTy || *DevTy == OMPDeclareTargetDeclAttr::DT_Host)) return; - // Ignore nohost functions during host analyzis. + // Ignore nohost functions during host analysis. if (!getLangOpts().OpenMPIsTargetDevice && DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) return; @@ -3010,6 +3011,29 @@ static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV, Expr *NumIterations, Sema &SemaRef, Scope *S, DSAStackTy *Stack); +static bool finishLinearClauses(Sema &SemaRef, ArrayRef Clauses, + OMPLoopBasedDirective::HelperExprs &B, + DSAStackTy *Stack) { + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && + "loop exprs were not built"); + + if (SemaRef.CurContext->isDependentContext()) + return false; + + // Finalize the clauses that need pre-built expressions for CodeGen. + for (OMPClause *C : Clauses) { + auto *LC = dyn_cast(C); + if (!LC) + continue; + if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), + B.NumIterations, SemaRef, + SemaRef.getCurScope(), Stack)) + return true; + } + + return false; +} + namespace { class VarDeclFilterCCC final : public CorrectionCandidateCallback { @@ -3880,7 +3904,7 @@ class DSAAttrChecker final : public StmtVisitor { if (SemaRef.LangOpts.OpenMP >= 50) return !StackComponents.empty(); // Variable is used if it has been marked as an array, array - // section, array shaping or the variable iself. + // section, array shaping or the variable itself. return StackComponents.size() == 1 || llvm::all_of( llvm::drop_begin(llvm::reverse(StackComponents)), @@ -5759,7 +5783,7 @@ static CapturedStmt *buildDistanceFunc(Sema &Actions, QualType LogicalTy, // Divide by the absolute step amount. If the range is not a multiple of // the step size, rounding-up the effective upper bound ensures that the // last iteration is included. - // Note that the rounding-up may cause an overflow in a temporry that + // Note that the rounding-up may cause an overflow in a temporary that // could be avoided, but would have occurred in a C-style for-loop as // well. 
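The rounding-up that the comment above describes amounts to a ceiling division of the iteration range by the step. A short C sketch of the equivalent trip-count computation (illustrative only, not part of this patch), with a concrete value:

  /* Logical iteration count of "for (i = lb; i < ub; i += step)". */
  unsigned long trip_count(long lb, long ub, long step) {
    if (ub <= lb)
      return 0;
    unsigned long range = (unsigned long)(ub - lb);
    /* Rounding the range up before dividing keeps the last iteration:
       lb = 0, ub = 10, step = 3 -> (10 + 3 - 1) / 3 = 4 iterations.
       The addition is the temporary that may overflow, as noted above. */
    return (range + (unsigned long)step - 1) / (unsigned long)step;
  }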
Expr *Divisor = BuildVarRef(NewStep); @@ -6040,7 +6064,7 @@ static ExprResult buildUserDefinedMapperRef(Sema &SemaRef, Scope *S, static void processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack, SmallVectorImpl &Clauses) { - // Check for the deault mapper for data members. + // Check for the default mapper for data members. if (S.getLangOpts().OpenMP < 50) return; SmallVector ImplicitMaps; @@ -6333,6 +6357,8 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( OpenMPDirectiveKind CancelRegion, ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, OpenMPDirectiveKind PrevMappedDirective) { + assert(isOpenMPExecutableDirective(Kind) && "Unexpected directive category"); + StmtResult Res = StmtError(); OpenMPBindClauseKind BindKind = OMPC_BIND_unknown; llvm::SmallVector ClausesWithoutBind; @@ -6525,18 +6551,14 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( ClausesWithImplicit); } - llvm::SmallVector AllowedNameModifiers; switch (Kind) { case OMPD_parallel: Res = ActOnOpenMPParallelDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_simd: Res = ActOnOpenMPSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_tile: Res = @@ -6553,8 +6575,6 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPD_for_simd: Res = ActOnOpenMPForSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_sections: Res = ActOnOpenMPSectionsDirective(ClausesWithImplicit, AStmt, StartLoc, @@ -6585,14 +6605,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPD_parallel_for: Res = ActOnOpenMPParallelForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_parallel_for_simd: Res = ActOnOpenMPParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_parallel); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_scope: Res = @@ -6601,22 +6617,18 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPD_parallel_master: Res = ActOnOpenMPParallelMasterDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_parallel_masked: Res = ActOnOpenMPParallelMaskedDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_parallel_sections: Res = ActOnOpenMPParallelSectionsDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_task: Res = ActOnOpenMPTaskDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_task); break; case OMPD_taskyield: assert(ClausesWithImplicit.empty() && @@ -6676,19 +6688,14 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPD_target: Res = ActOnOpenMPTargetDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_target); break; case OMPD_target_parallel: Res = ActOnOpenMPTargetParallelDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_target); - 
AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_target_parallel_for: Res = ActOnOpenMPTargetParallelForDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_cancellation_point: assert(ClausesWithImplicit.empty() && @@ -6702,90 +6709,58 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( "No associated statement allowed for 'omp cancel' directive"); Res = ActOnOpenMPCancelDirective(ClausesWithImplicit, StartLoc, EndLoc, CancelRegion); - AllowedNameModifiers.push_back(OMPD_cancel); break; case OMPD_target_data: Res = ActOnOpenMPTargetDataDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_target_data); break; case OMPD_target_enter_data: Res = ActOnOpenMPTargetEnterDataDirective(ClausesWithImplicit, StartLoc, EndLoc, AStmt); - AllowedNameModifiers.push_back(OMPD_target_enter_data); break; case OMPD_target_exit_data: Res = ActOnOpenMPTargetExitDataDirective(ClausesWithImplicit, StartLoc, EndLoc, AStmt); - AllowedNameModifiers.push_back(OMPD_target_exit_data); break; case OMPD_taskloop: Res = ActOnOpenMPTaskLoopDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_taskloop); break; case OMPD_taskloop_simd: Res = ActOnOpenMPTaskLoopSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_taskloop); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_master_taskloop: Res = ActOnOpenMPMasterTaskLoopDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_taskloop); break; case OMPD_masked_taskloop: Res = ActOnOpenMPMaskedTaskLoopDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_taskloop); break; case OMPD_master_taskloop_simd: Res = ActOnOpenMPMasterTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_taskloop); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_masked_taskloop_simd: Res = ActOnOpenMPMaskedTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (getLangOpts().OpenMP >= 51) { - AllowedNameModifiers.push_back(OMPD_taskloop); - AllowedNameModifiers.push_back(OMPD_simd); - } break; case OMPD_parallel_master_taskloop: Res = ActOnOpenMPParallelMasterTaskLoopDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_taskloop); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_parallel_masked_taskloop: Res = ActOnOpenMPParallelMaskedTaskLoopDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (getLangOpts().OpenMP >= 51) { - AllowedNameModifiers.push_back(OMPD_taskloop); - AllowedNameModifiers.push_back(OMPD_parallel); - } break; case OMPD_parallel_master_taskloop_simd: Res = ActOnOpenMPParallelMasterTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_taskloop); - AllowedNameModifiers.push_back(OMPD_parallel); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_parallel_masked_taskloop_simd: Res = 
ActOnOpenMPParallelMaskedTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (getLangOpts().OpenMP >= 51) { - AllowedNameModifiers.push_back(OMPD_taskloop); - AllowedNameModifiers.push_back(OMPD_parallel); - AllowedNameModifiers.push_back(OMPD_simd); - } break; case OMPD_distribute: Res = ActOnOpenMPDistributeDirective(ClausesWithImplicit, AStmt, StartLoc, @@ -6794,40 +6769,26 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPD_target_update: Res = ActOnOpenMPTargetUpdateDirective(ClausesWithImplicit, StartLoc, EndLoc, AStmt); - AllowedNameModifiers.push_back(OMPD_target_update); break; case OMPD_distribute_parallel_for: Res = ActOnOpenMPDistributeParallelForDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_distribute_parallel_for_simd: Res = ActOnOpenMPDistributeParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_parallel); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_distribute_simd: Res = ActOnOpenMPDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_parallel_for_simd: Res = ActOnOpenMPTargetParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); - AllowedNameModifiers.push_back(OMPD_parallel); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_simd: Res = ActOnOpenMPTargetSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute: Res = ActOnOpenMPTeamsDistributeDirective( @@ -6836,51 +6797,34 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPD_teams_distribute_simd: Res = ActOnOpenMPTeamsDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute_parallel_for_simd: Res = ActOnOpenMPTeamsDistributeParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_parallel); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute_parallel_for: Res = ActOnOpenMPTeamsDistributeParallelForDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_parallel); break; case OMPD_target_teams: Res = ActOnOpenMPTargetTeamsDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); - AllowedNameModifiers.push_back(OMPD_target); break; case OMPD_target_teams_distribute: Res = ActOnOpenMPTargetTeamsDistributeDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); break; case OMPD_target_teams_distribute_parallel_for: Res = ActOnOpenMPTargetTeamsDistributeParallelForDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); - AllowedNameModifiers.push_back(OMPD_parallel); break; case 
OMPD_target_teams_distribute_parallel_for_simd: Res = ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); - AllowedNameModifiers.push_back(OMPD_parallel); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_teams_distribute_simd: Res = ActOnOpenMPTargetTeamsDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); - if (getLangOpts().OpenMP >= 50) - AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_interop: assert(AStmt == nullptr && @@ -6902,7 +6846,6 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPD_target_teams_loop: Res = ActOnOpenMPTargetTeamsGenericLoopDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - AllowedNameModifiers.push_back(OMPD_target); break; case OMPD_parallel_loop: Res = ActOnOpenMPParallelGenericLoopDirective( @@ -6941,7 +6884,7 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( switch (C->getClauseKind()) { case OMPC_num_threads: case OMPC_dist_schedule: - // Do not analyse if no parent teams directive. + // Do not analyze if no parent teams directive. if (isOpenMPTeamsDirective(Kind)) break; continue; @@ -7072,6 +7015,11 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( } } + llvm::SmallVector AllowedNameModifiers; + for (OpenMPDirectiveKind D : getLeafConstructsOrSelf(Kind)) { + if (isAllowedClauseForDirective(D, OMPC_if, getLangOpts().OpenMP)) + AllowedNameModifiers.push_back(D); + } if (!AllowedNameModifiers.empty()) ErrorFound = checkIfClauses(SemaRef, Kind, Clauses, AllowedNameModifiers) || ErrorFound; @@ -7757,7 +7705,7 @@ SemaOpenMP::checkOpenMPDeclareVariantFunction(SemaOpenMP::DeclGroupPtrTy DG, FnPtrType = Context.getMemberPointerType(AdjustedFnType, ClassType); ExprResult ER; { - // Build adrr_of unary op to correctly handle type checks for member + // Build addr_of unary op to correctly handle type checks for member // functions. Sema::TentativeAnalysisScope Trap(SemaRef); ER = SemaRef.CreateBuiltinUnaryOp(VariantRef->getBeginLoc(), UO_AddrOf, @@ -7994,14 +7942,10 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( FD->addAttr(NewAttr); } -StmtResult -SemaOpenMP::ActOnOpenMPParallelDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { - if (!AStmt) - return StmtError(); - - auto *CS = cast(AStmt); +static CapturedStmt * +setBranchProtectedScope(Sema &SemaRef, OpenMPDirectiveKind DKind, Stmt *AStmt) { + auto *CS = dyn_cast(AStmt); + assert(CS && "Captured statement expected"); // 1.2.2 OpenMP Language Terminology // Structured block - An executable statement with a single entry at the // top and a single exit at the bottom. @@ -8009,7 +7953,28 @@ SemaOpenMP::ActOnOpenMPParallelDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. CS->getCapturedDecl()->setNothrow(); + for (int ThisCaptureLevel = SemaRef.OpenMP().getOpenMPCaptureLevels(DKind); + ThisCaptureLevel > 1; --ThisCaptureLevel) { + CS = cast(CS->getCapturedStmt()); + // 1.2.2 OpenMP Language Terminology + // Structured block - An executable statement with a single entry at the + // top and a single exit at the bottom. + // The point of exit cannot be a branch out of the structured block. + // longjmp() and throw() must not violate the entry/exit criteria. 
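With AllowedNameModifiers now derived from the leaf constructs of the directive (the loop over getLeafConstructsOrSelf above), the directive-name modifiers accepted on 'if' clauses follow the combined form directly. A C + OpenMP sketch (illustrative only, not part of this patch; OpenMP 5.0 or later is assumed for the simd modifier):

  void saxpy(int n, float a, float *x, float *y, int use_dev) {
  #pragma omp target parallel for simd if(target: use_dev) \
      if(parallel: n > 1024) if(simd: n % 8 == 0)
    for (int i = 0; i < n; ++i)
      y[i] = a * x[i] + y[i];
  }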
+ CS->getCapturedDecl()->setNothrow(); + } SemaRef.setFunctionHasBranchProtectedScope(); + return CS; +} + +StmtResult +SemaOpenMP::ActOnOpenMPParallelDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { + if (!AStmt) + return StmtError(); + + setBranchProtectedScope(SemaRef, OMPD_parallel, AStmt); return OMPParallelDirective::Create( getASTContext(), StartLoc, EndLoc, Clauses, AStmt, @@ -8299,7 +8264,7 @@ bool OpenMPIterationSpaceChecker::setStep(Expr *NewStep, bool Subtract) { diag::err_omp_loop_incr_not_compatible) << LCDecl << *TestIsLessOp << NewStep->getSourceRange(); SemaRef.Diag(ConditionLoc, - diag::note_omp_loop_cond_requres_compatible_incr) + diag::note_omp_loop_cond_requires_compatible_incr) << *TestIsLessOp << ConditionSrcRange; return true; } @@ -9382,7 +9347,7 @@ void SemaOpenMP::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, } } DSAStack->addLoopControlVariable(D, VD); - const Decl *LD = DSAStack->getPossiblyLoopCunter(); + const Decl *LD = DSAStack->getPossiblyLoopCounter(); if (LD != D->getCanonicalDecl()) { DSAStack->resetPossibleLoopCounter(); if (auto *Var = dyn_cast_or_null(LD)) @@ -9444,7 +9409,7 @@ void SemaOpenMP::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, } namespace { -// Utility for openmp doacross clause kind +// Utility for OpenMP doacross clause kind class OMPDoacrossKind { public: bool isSource(const OMPDoacrossClause *C) { @@ -9822,7 +9787,7 @@ static Stmt *buildPreInits(ASTContext &Context, /// stored in lieu of using an explicit list. Flattening is necessary because /// contained DeclStmts need to be visible after the execution of the list. Used /// for OpenMP pre-init declarations/statements. -static void appendFlattendedStmtList(SmallVectorImpl &TargetList, +static void appendFlattenedStmtList(SmallVectorImpl &TargetList, Stmt *Item) { // nullptr represents an empty list. if (!Item) @@ -9854,7 +9819,7 @@ static Stmt *buildPreInits(ASTContext &Context, ArrayRef PreInits) { SmallVector Stmts; for (Stmt *S : PreInits) - appendFlattendedStmtList(Stmts, S); + appendFlattenedStmtList(Stmts, S); return CompoundStmt::Create(Context, PreInits, FPOptionsOverride(), {}, {}); } @@ -9958,7 +9923,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, // Search for pre-init declared variables that need to be captured // to be referenceable inside the directive. SmallVector Constituents; - appendFlattendedStmtList(Constituents, DependentPreInits); + appendFlattenedStmtList(Constituents, DependentPreInits); for (Stmt *S : Constituents) { if (auto *DC = dyn_cast(S)) { for (Decl *C : DC->decls()) { @@ -10678,19 +10643,8 @@ StmtResult SemaOpenMP::ActOnOpenMPSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp simd loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. 
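The linear-clause finalization that finishLinearClauses now centralizes applies to code like the following C + OpenMP sketch (illustrative only, not part of this patch; the function and variable names are hypothetical), where the listed variable advances by the declared step on every logical iteration:

  void pack_even(int n, const float *src, float *dst) {
    int j = 0;
  #pragma omp simd linear(j : 2)
    for (int i = 0; i < n; ++i) {
      dst[j] = src[i];
      j += 2; /* must match the declared linear step */
    }
  }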
- for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); @@ -10721,19 +10675,8 @@ StmtResult SemaOpenMP::ActOnOpenMPForDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); auto *ForDirective = OMPForDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, @@ -10759,19 +10702,8 @@ StmtResult SemaOpenMP::ActOnOpenMPForSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for simd loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); @@ -10781,12 +10713,10 @@ StmtResult SemaOpenMP::ActOnOpenMPForSimdDirective( NestedLoopCount, Clauses, AStmt, B); } -StmtResult -SemaOpenMP::ActOnOpenMPSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +static bool checkSectionsDirective(Sema &SemaRef, OpenMPDirectiveKind DKind, + Stmt *AStmt, DSAStackTy *Stack) { if (!AStmt) - return StmtError(); + return true; assert(isa(AStmt) && "Captured statement expected"); auto BaseStmt = AStmt; @@ -10795,23 +10725,34 @@ SemaOpenMP::ActOnOpenMPSectionsDirective(ArrayRef Clauses, if (auto *C = dyn_cast_or_null(BaseStmt)) { auto S = C->children(); if (S.begin() == S.end()) - return StmtError(); + return true; // All associated statements must be '#pragma omp section' except for // the first one. 
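The rule stated in the comment above corresponds to source of the following shape, a C + OpenMP sketch (illustrative only, not part of this patch; the helper functions are hypothetical): every statement after the first inside the compound statement must itself be a '#pragma omp section'.

  void do_first_half(void);
  void do_second_half(void);

  void split_work(void) {
  #pragma omp parallel sections
    {
  #pragma omp section
      do_first_half();
  #pragma omp section
      do_second_half();
    }
  }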
for (Stmt *SectionStmt : llvm::drop_begin(S)) { if (!SectionStmt || !isa(SectionStmt)) { if (SectionStmt) - Diag(SectionStmt->getBeginLoc(), - diag::err_omp_sections_substmt_not_section); - return StmtError(); + SemaRef.Diag(SectionStmt->getBeginLoc(), + diag::err_omp_sections_substmt_not_section) + << getOpenMPDirectiveName(DKind); + return true; } cast(SectionStmt) - ->setHasCancel(DSAStack->isCancelRegion()); + ->setHasCancel(Stack->isCancelRegion()); } } else { - Diag(AStmt->getBeginLoc(), diag::err_omp_sections_not_compound_stmt); - return StmtError(); + SemaRef.Diag(AStmt->getBeginLoc(), diag::err_omp_sections_not_compound_stmt) + << getOpenMPDirectiveName(DKind); + return true; } + return false; +} + +StmtResult +SemaOpenMP::ActOnOpenMPSectionsDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { + if (checkSectionsDirective(SemaRef, OMPD_sections, AStmt, DSAStack)) + return StmtError(); SemaRef.setFunctionHasBranchProtectedScope(); @@ -10928,13 +10869,7 @@ StmtResult SemaOpenMP::ActOnOpenMPGenericLoopDirective( if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_loop, DSAStack)) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); + setBranchProtectedScope(SemaRef, OMPD_loop, AStmt); OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse', it will define the nested loops number. @@ -10947,7 +10882,6 @@ StmtResult SemaOpenMP::ActOnOpenMPGenericLoopDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPGenericLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -10964,23 +10898,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsGenericLoopDirective( if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_teams_loop, DSAStack)) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_teams_loop); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope(SemaRef, OMPD_teams_loop, AStmt); OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse', it will define the nested loops number. 
@@ -10994,7 +10912,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsGenericLoopDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsGenericLoopDirective::Create( @@ -11014,23 +10931,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsGenericLoopDirective( DSAStack)) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_teams_loop); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_target_teams_loop, AStmt); OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse', it will define the nested loops number. @@ -11044,8 +10946,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsGenericLoopDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTargetTeamsGenericLoopDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, teamsLoopCanBeParallelFor(AStmt, SemaRef)); @@ -11064,23 +10964,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelGenericLoopDirective( DSAStack)) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_parallel_loop); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_parallel_loop, AStmt); OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse', it will define the nested loops number. 
@@ -11094,8 +10979,6 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelGenericLoopDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); - return OMPParallelGenericLoopDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -11113,23 +10996,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetParallelGenericLoopDirective( DSAStack)) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel_loop); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_target_parallel_loop, AStmt); OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse', it will define the nested loops number. @@ -11143,8 +11011,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetParallelGenericLoopDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTargetParallelGenericLoopDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -11268,13 +11134,7 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelForDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); + setBranchProtectedScope(SemaRef, OMPD_parallel_for, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -11286,21 +11146,9 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelForDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp parallel for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. 
- for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelForDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); @@ -11312,13 +11160,7 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelForSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); + setBranchProtectedScope(SemaRef, OMPD_parallel_for_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -11330,21 +11172,12 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelForSimdDirective( if (NestedLoopCount == 0) return StmtError(); - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelForSimdDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -11355,16 +11188,7 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMasterDirective( if (!AStmt) return StmtError(); - assert(isa(AStmt) && "Captured statement expected"); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - - SemaRef.setFunctionHasBranchProtectedScope(); + setBranchProtectedScope(SemaRef, OMPD_parallel_master, AStmt); return OMPParallelMasterDirective::Create( getASTContext(), StartLoc, EndLoc, Clauses, AStmt, @@ -11377,16 +11201,7 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedDirective( if (!AStmt) return StmtError(); - assert(isa(AStmt) && "Captured statement expected"); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. 
- CS->getCapturedDecl()->setNothrow(); - - SemaRef.setFunctionHasBranchProtectedScope(); + setBranchProtectedScope(SemaRef, OMPD_parallel_masked, AStmt); return OMPParallelMaskedDirective::Create( getASTContext(), StartLoc, EndLoc, Clauses, AStmt, @@ -11396,34 +11211,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedDirective( StmtResult SemaOpenMP::ActOnOpenMPParallelSectionsDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) { - if (!AStmt) - return StmtError(); - - assert(isa(AStmt) && "Captured statement expected"); - auto BaseStmt = AStmt; - while (auto *CS = dyn_cast_or_null(BaseStmt)) - BaseStmt = CS->getCapturedStmt(); - if (auto *C = dyn_cast_or_null(BaseStmt)) { - auto S = C->children(); - if (S.begin() == S.end()) - return StmtError(); - // All associated statements must be '#pragma omp section' except for - // the first one. - for (Stmt *SectionStmt : llvm::drop_begin(S)) { - if (!SectionStmt || !isa(SectionStmt)) { - if (SectionStmt) - Diag(SectionStmt->getBeginLoc(), - diag::err_omp_parallel_sections_substmt_not_section); - return StmtError(); - } - cast(SectionStmt) - ->setHasCancel(DSAStack->isCancelRegion()); - } - } else { - Diag(AStmt->getBeginLoc(), - diag::err_omp_parallel_sections_not_compound_stmt); + if (checkSectionsDirective(SemaRef, OMPD_parallel_sections, AStmt, DSAStack)) return StmtError(); - } SemaRef.setFunctionHasBranchProtectedScope(); @@ -11469,15 +11258,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTaskDirective(ArrayRef Clauses, {OMPC_detach, OMPC_mergeable})) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - - SemaRef.setFunctionHasBranchProtectedScope(); + setBranchProtectedScope(SemaRef, OMPD_task, AStmt); return OMPTaskDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, AStmt, DSAStack->isCancelRegion()); @@ -11628,7 +11409,7 @@ StmtResult SemaOpenMP::ActOnOpenMPScanDirective(ArrayRef Clauses, diag::err_omp_scan_single_clause_expected); return StmtError(); } - // Check that scan directive is used in the scopeof the OpenMP loop body. + // Check that scan directive is used in the scope of the OpenMP loop body. if (Scope *S = DSAStack->getCurScope()) { Scope *ParentS = S->getParent(); if (!ParentS || ParentS->getParent() != ParentS->getBreakParent() || @@ -11783,7 +11564,7 @@ class OpenMPAtomicUpdateChecker { NotAnAssignmentOp, /// RHS part of the binary operation is not a binary expression. NotABinaryExpression, - /// RHS part is not additive/multiplicative/shift/biwise binary + /// RHS part is not additive/multiplicative/shift/bitwise binary /// expression. NotABinaryOperator, /// RHS binary operation does not have reference to the updated LHS @@ -12061,7 +11842,7 @@ class OpenMPAtomicCompareChecker { InvalidAssignment, /// Not if statement NotIfStmt, - /// More than two statements in a compund statement. + /// More than two statements in a compound statement. MoreThanTwoStmts, /// Not a compound statement. 
NotCompoundStmt, @@ -12148,7 +11929,7 @@ class OpenMPAtomicCompareChecker { return true; } - }; +}; bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, ErrorInfoTy &ErrorInfo) { @@ -13357,7 +13138,7 @@ StmtResult SemaOpenMP::ActOnOpenMPAtomicDirective(ArrayRef Clauses, Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare) << ErrorInfo.ErrorRange; Diag(ErrorInfo.NoteLoc, diag::note_omp_atomic_compare) - << ErrorInfo.Error << ErrorInfo.NoteRange; + << ErrorInfo.Error << ErrorInfo.NoteRange; return StmtError(); } X = Checker.getX(); @@ -13404,23 +13185,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetDirective(ArrayRef Clauses, if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope(SemaRef, OMPD_target, AStmt); // OpenMP [2.16, Nesting of Regions] // If specified, a teams construct must be contained within a target @@ -13433,10 +13198,14 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetDirective(ArrayRef Clauses, auto I = CS->body_begin(); while (I != CS->body_end()) { const auto *OED = dyn_cast(*I); - if (!OED || !isOpenMPTeamsDirective(OED->getDirectiveKind()) || - OMPTeamsFound) { - + bool IsTeams = OED && isOpenMPTeamsDirective(OED->getDirectiveKind()); + if (!IsTeams || I != CS->body_begin()) { OMPTeamsFound = false; + if (IsTeams && I != CS->body_begin()) { + // This is the two teams case. Since the InnerTeamsRegionLoc will + // point to this second one reset the iterator to the other teams. + --I; + } break; } ++I; @@ -13457,8 +13226,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetDirective(ArrayRef Clauses, } } - SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTargetDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, AStmt); } @@ -13469,25 +13236,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetParallelDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. 
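The OMPTeamsFound scan in ActOnOpenMPTargetDirective above enforces the nesting restriction that a teams construct appearing inside target must be the only statement strictly nested there. A C + OpenMP sketch of the accepted shape (illustrative only, not part of this patch; the function name is hypothetical):

  void scale(int n, float *a) {
  #pragma omp target map(tofrom: a[0:n])
  #pragma omp teams distribute parallel for
    for (int i = 0; i < n; ++i)
      a[i] *= 2.0f;
  }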
- CS->getCapturedDecl()->setNothrow(); - } - - SemaRef.setFunctionHasBranchProtectedScope(); + setBranchProtectedScope(SemaRef, OMPD_target_parallel, AStmt); return OMPTargetParallelDirective::Create( getASTContext(), StartLoc, EndLoc, Clauses, AStmt, @@ -13500,23 +13249,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel_for); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_target_parallel_for, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -13528,21 +13262,9 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp target parallel for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelForDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); @@ -13618,23 +13340,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetEnterDataDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_enter_data); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + setBranchProtectedScope(SemaRef, OMPD_target_enter_data, AStmt); // OpenMP [2.10.2, Restrictions, p. 99] // At least one map clause must appear on the directive. 
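The map-clause restriction noted above (and the matching one for target exit data below) corresponds to usage like this C + OpenMP sketch (illustrative only, not part of this patch; the function and variable names are hypothetical):

  void stage_and_compute(int n, float *buf) {
  #pragma omp target enter data map(to: buf[0:n])
  #pragma omp target teams distribute parallel for
    for (int i = 0; i < n; ++i)
      buf[i] += 1.0f;
  #pragma omp target exit data map(from: buf[0:n])
  }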
@@ -13654,23 +13360,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetExitDataDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_exit_data); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + setBranchProtectedScope(SemaRef, OMPD_target_exit_data, AStmt); // OpenMP [2.10.3, Restrictions, p. 102] // At least one map clause must appear on the directive. @@ -13690,23 +13380,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetUpdateDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_update); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + setBranchProtectedScope(SemaRef, OMPD_target_update, AStmt); if (!hasClauses(Clauses, OMPC_to, OMPC_from)) { Diag(StartLoc, diag::err_omp_at_least_one_motion_clause_required); @@ -13733,15 +13407,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDirective(ArrayRef Clauses, if (getLangOpts().HIP && (DSAStack->getParentDirective() == OMPD_target)) Diag(StartLoc, diag::warn_hip_omp_target_directives); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - - SemaRef.setFunctionHasBranchProtectedScope(); + setBranchProtectedScope(SemaRef, OMPD_teams, AStmt); DSAStack->setParentTeamsRegionLoc(StartLoc); @@ -13862,19 +13528,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTaskLoopSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. 
- for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may @@ -13988,19 +13643,8 @@ StmtResult SemaOpenMP::ActOnOpenMPMasterTaskLoopSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may @@ -14038,19 +13682,8 @@ StmtResult SemaOpenMP::ActOnOpenMPMaskedTaskLoopSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may @@ -14077,25 +13710,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopDirective( if (!AStmt) return StmtError(); - assert(isa(AStmt) && "Captured statement expected"); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_parallel_master_taskloop); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. 
- CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_parallel_master_taskloop, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -14122,7 +13738,6 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopDirective( if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterTaskLoopDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); @@ -14134,25 +13749,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopDirective( if (!AStmt) return StmtError(); - assert(isa(AStmt) && "Captured statement expected"); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_parallel_masked_taskloop); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_parallel_masked_taskloop, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -14179,7 +13777,6 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopDirective( if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedTaskLoopDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); @@ -14191,25 +13788,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopSimdDirective( if (!AStmt) return StmtError(); - assert(isa(AStmt) && "Captured statement expected"); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_parallel_master_taskloop_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. 
- CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_parallel_master_taskloop_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -14221,19 +13801,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may @@ -14249,7 +13818,6 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopSimdDirective( if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterTaskLoopSimdDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -14260,25 +13828,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( if (!AStmt) return StmtError(); - assert(isa(AStmt) && "Captured statement expected"); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_parallel_masked_taskloop_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_parallel_masked_taskloop_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -14290,19 +13841,8 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. 
- for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may @@ -14318,7 +13858,6 @@ StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedTaskLoopSimdDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -14359,24 +13898,8 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_distribute_parallel_for); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_distribute_parallel_for, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14391,7 +13914,6 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPDistributeParallelForDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); @@ -14403,24 +13925,8 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_distribute_parallel_for_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. 
- CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_distribute_parallel_for_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14432,24 +13938,12 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPDistributeParallelForSimdDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -14457,26 +13951,11 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForSimdDirective( StmtResult SemaOpenMP::ActOnOpenMPDistributeSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { - if (!AStmt) - return StmtError(); - - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_distribute_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + if (!AStmt) + return StmtError(); + + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_distribute_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14488,24 +13967,12 @@ StmtResult SemaOpenMP::ActOnOpenMPDistributeSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. 
- for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPDistributeSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -14516,24 +13983,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_target_parallel_for_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_target_parallel_for_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will @@ -14545,23 +13996,12 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp target parallel for simd loop exprs were not built"); + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelForSimdDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -14572,23 +14012,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. 
- // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope(SemaRef, OMPD_target_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will define the @@ -14600,24 +14024,12 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp target simd loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -14628,23 +14040,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_teams_distribute); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_teams_distribute, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14659,8 +14056,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp teams distribute loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); - DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeDirective::Create( @@ -14673,24 +14068,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_teams_distribute_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. 
- // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_teams_distribute_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14699,29 +14078,15 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeSimdDirective( OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); - if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp teams distribute simd loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); - DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeSimdDirective::Create( @@ -14734,25 +14099,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_teams_distribute_parallel_for_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_teams_distribute_parallel_for_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14761,29 +14109,15 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForSimdDirective( OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); - if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. 
- for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); - DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeParallelForSimdDirective::Create( @@ -14796,25 +14130,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_teams_distribute_parallel_for); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_teams_distribute_parallel_for, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14830,8 +14147,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); - DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeParallelForDirective::Create( @@ -14845,25 +14160,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - - for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_teams); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. 
- CS->getCapturedDecl()->setNothrow(); - } - SemaRef.setFunctionHasBranchProtectedScope(); + setBranchProtectedScope(SemaRef, OMPD_target_teams, AStmt); const OMPClause *BareClause = nullptr; bool HasThreadLimitAndNumTeamsClause = hasClauses(Clauses, OMPC_num_teams) && @@ -14888,24 +14185,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_target_teams_distribute); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = + setBranchProtectedScope(SemaRef, OMPD_target_teams_distribute, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14920,7 +14201,6 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeDirective( assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute loop exprs were not built"); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -14931,24 +14211,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_target_teams_distribute_parallel_for); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_target_teams_distribute_parallel_for, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -14960,21 +14224,9 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp target teams distribute parallel for loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. 
- for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeParallelForDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); @@ -14986,24 +14238,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = getOpenMPCaptureLevels( - OMPD_target_teams_distribute_parallel_for_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_target_teams_distribute_parallel_for_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -15016,25 +14252,12 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp target teams distribute parallel for simd loop exprs were not " - "built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeParallelForSimdDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -15045,24 +14268,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective( if (!AStmt) return StmtError(); - auto *CS = cast(AStmt); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. 
- CS->getCapturedDecl()->setNothrow(); - for (int ThisCaptureLevel = - getOpenMPCaptureLevels(OMPD_target_teams_distribute_simd); - ThisCaptureLevel > 1; --ThisCaptureLevel) { - CS = cast(CS->getCapturedStmt()); - // 1.2.2 OpenMP Language Terminology - // Structured block - An executable statement with a single entry at the - // top and a single exit at the bottom. - // The point of exit cannot be a branch out of the structured block. - // longjmp() and throw() must not violate the entry/exit criteria. - CS->getCapturedDecl()->setNothrow(); - } + CapturedStmt *CS = setBranchProtectedScope( + SemaRef, OMPD_target_teams_distribute_simd, AStmt); OMPLoopBasedDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will @@ -15074,24 +14281,12 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective( if (NestedLoopCount == 0) return StmtError(); - assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && - "omp target teams distribute simd loop exprs were not built"); - - if (!SemaRef.CurContext->isDependentContext()) { - // Finalize the clauses that need pre-built expressions for CodeGen. - for (OMPClause *C : Clauses) { - if (auto *LC = dyn_cast(C)) - if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, SemaRef, - SemaRef.getCurScope(), DSAStack)) - return StmtError(); - } - } + if (finishLinearClauses(SemaRef, Clauses, B, DSAStack)) + return StmtError(); if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeSimdDirective::Create( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } @@ -15134,7 +14329,7 @@ bool SemaOpenMP::checkTransformableLoopNest( else llvm_unreachable("Unhandled loop transformation"); - appendFlattendedStmtList(OriginalInits.back(), DependentPreInits); + appendFlattenedStmtList(OriginalInits.back(), DependentPreInits); }); assert(OriginalInits.back().empty() && "No preinit after innermost loop"); OriginalInits.pop_back(); @@ -17238,9 +16433,8 @@ OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause( ArgumentLoc, StartLoc, LParenLoc, EndLoc); break; case OMPC_fail: - Res = ActOnOpenMPFailClause( - static_cast(Argument), - ArgumentLoc, StartLoc, LParenLoc, EndLoc); + Res = ActOnOpenMPFailClause(static_cast(Argument), + ArgumentLoc, StartLoc, LParenLoc, EndLoc); break; case OMPC_update: Res = ActOnOpenMPUpdateClause(static_cast(Argument), @@ -23895,8 +23089,7 @@ void SemaOpenMP::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, /// This class visits every VarDecl that the initializer references and adds /// OMPDeclareTargetDeclAttr to each of them. -class GlobalDeclRefChecker final - : public StmtVisitor { +class GlobalDeclRefChecker final : public StmtVisitor { SmallVector DeclVector; Attr *A; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index fb4ff72e42eb53..f8f55ce6526acd 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1877,6 +1877,27 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType, FromFn = QT->getAs(); Changed = true; } + + // For C, when called from checkPointerTypesForAssignment, + // we need to not alter FromFn, or else even an innocuous cast + // like dropping effects will fail. In C++ however we do want to + // alter FromFn (because of the way PerformImplicitConversion works). 
+ if (getLangOpts().CPlusPlus) { + FromFPT = cast(FromFn); // in case FromFn changed above + + // Transparently add/drop effects; here we are concerned with + // language rules/canonicalization. Adding/dropping effects is a warning. + const auto FromFX = FromFPT->getFunctionEffects(); + const auto ToFX = ToFPT->getFunctionEffects(); + if (FromFX != ToFX) { + FunctionProtoType::ExtProtoInfo ExtInfo = FromFPT->getExtProtoInfo(); + ExtInfo.FunctionEffects = ToFX; + QualType QT = Context.getFunctionType( + FromFPT->getReturnType(), FromFPT->getParamTypes(), ExtInfo); + FromFn = QT->getAs(); + Changed = true; + } + } } if (!Changed) @@ -4351,8 +4372,8 @@ HLSLCompareFloatingRank(QualType LHS, QualType RHS) { if (const auto *VT = RHS->getAs()) RHS = VT->getElementType(); - const auto L = LHS->getAs()->getKind(); - const auto R = RHS->getAs()->getKind(); + const auto L = LHS->castAs()->getKind(); + const auto R = RHS->castAs()->getKind(); if (L == R) return ImplicitConversionSequence::Indistinguishable; return L < R ? ImplicitConversionSequence::Better @@ -12148,7 +12169,7 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand, case ovl_fail_bad_conversion: { unsigned I = (Cand->IgnoreObjectArgument ? 1 : 0); for (unsigned N = Cand->Conversions.size(); I != N; ++I) - if (Cand->Conversions[I].isBad()) + if (Cand->Conversions[I].isInitialized() && Cand->Conversions[I].isBad()) return DiagnoseBadConversion(S, Cand, I, TakingCandidateAddress); // FIXME: this currently happens when we're called from SemaInit diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index a032e3ec6f6353..9f4acbe5e6dd55 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2513,6 +2513,9 @@ struct ConvertConstructorToDeductionGuideTransform { TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, DeductionGuideName); + if (!TSI) + return nullptr; + FunctionProtoTypeLoc FPTL = TSI->getTypeLoc().castAs(); @@ -2523,6 +2526,9 @@ struct ConvertConstructorToDeductionGuideTransform { if (NestedPattern) TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, DeclarationName()); + if (!TSI) + return nullptr; + ParmVarDecl *NewParam = ParmVarDecl::Create(SemaRef.Context, DC, Loc, Loc, nullptr, TSI->getType(), TSI, SC_None, nullptr); @@ -3170,10 +3176,6 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, Expr *RequiresClause = buildAssociatedConstraints( SemaRef, F, AliasTemplate, DeduceResults, IsDeducible); - // FIXME: implement the is_deducible constraint per C++ - // [over.match.class.deduct]p3.3: - // ... and a constraint that is satisfied if and only if the arguments - // of A are deducible (see below) from the return type. 
auto *FPrimeTemplateParamList = TemplateParameterList::Create( Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(), AliasTemplate->getTemplateParameters()->getLAngleLoc(), diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 9bb12c6aa7b122..308274720d58d6 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -31,6 +31,7 @@ #include "clang/Sema/DeclSpec.h" #include "clang/Sema/DelayedDiagnostic.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" @@ -149,6 +150,10 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, #define FUNCTION_TYPE_ATTRS_CASELIST \ case ParsedAttr::AT_NSReturnsRetained: \ case ParsedAttr::AT_NoReturn: \ + case ParsedAttr::AT_NonBlocking: \ + case ParsedAttr::AT_NonAllocating: \ + case ParsedAttr::AT_Blocking: \ + case ParsedAttr::AT_Allocating: \ case ParsedAttr::AT_Regparm: \ case ParsedAttr::AT_CmseNSCall: \ case ParsedAttr::AT_ArmStreaming: \ @@ -7522,6 +7527,111 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) { llvm_unreachable("unexpected attribute kind!"); } +std::optional +Sema::ActOnEffectExpression(Expr *CondExpr, StringRef AttributeName) { + if (CondExpr->isTypeDependent() || CondExpr->isValueDependent()) + return FunctionEffectMode::Dependent; + + std::optional ConditionValue = + CondExpr->getIntegerConstantExpr(Context); + if (!ConditionValue) { + // FIXME: err_attribute_argument_type doesn't quote the attribute + // name but needs to; users are inconsistent. + Diag(CondExpr->getExprLoc(), diag::err_attribute_argument_type) + << AttributeName << AANT_ArgumentIntegerConstant + << CondExpr->getSourceRange(); + return std::nullopt; + } + return !ConditionValue->isZero() ? FunctionEffectMode::True + : FunctionEffectMode::False; +} + +static bool +handleNonBlockingNonAllocatingTypeAttr(TypeProcessingState &TPState, + ParsedAttr &PAttr, QualType &QT, + FunctionTypeUnwrapper &Unwrapped) { + // Delay if this is not a function type. + if (!Unwrapped.isFunctionType()) + return false; + + Sema &S = TPState.getSema(); + + // Require FunctionProtoType. + auto *FPT = Unwrapped.get()->getAs(); + if (FPT == nullptr) { + S.Diag(PAttr.getLoc(), diag::err_func_with_effects_no_prototype) + << PAttr.getAttrName()->getName(); + return true; + } + + // Parse the new attribute. + // non/blocking or non/allocating? Or conditional (computed)? + bool IsNonBlocking = PAttr.getKind() == ParsedAttr::AT_NonBlocking || + PAttr.getKind() == ParsedAttr::AT_Blocking; + + FunctionEffectMode NewMode = FunctionEffectMode::None; + Expr *CondExpr = nullptr; // only valid if dependent + + if (PAttr.getKind() == ParsedAttr::AT_NonBlocking || + PAttr.getKind() == ParsedAttr::AT_NonAllocating) { + if (!PAttr.checkAtMostNumArgs(S, 1)) { + PAttr.setInvalid(); + return true; + } + + // Parse the condition, if any. + if (PAttr.getNumArgs() == 1) { + CondExpr = PAttr.getArgAsExpr(0); + std::optional MaybeMode = + S.ActOnEffectExpression(CondExpr, PAttr.getAttrName()->getName()); + if (!MaybeMode) { + PAttr.setInvalid(); + return true; + } + NewMode = *MaybeMode; + if (NewMode != FunctionEffectMode::Dependent) + CondExpr = nullptr; + } else { + NewMode = FunctionEffectMode::True; + } + } else { + // This is the `blocking` or `allocating` attribute. + if (S.CheckAttrNoArgs(PAttr)) { + // The attribute has been marked invalid. 
+ return true; + } + NewMode = FunctionEffectMode::False; + } + + const FunctionEffect::Kind FEKind = + (NewMode == FunctionEffectMode::False) + ? (IsNonBlocking ? FunctionEffect::Kind::Blocking + : FunctionEffect::Kind::Allocating) + : (IsNonBlocking ? FunctionEffect::Kind::NonBlocking + : FunctionEffect::Kind::NonAllocating); + const FunctionEffectWithCondition NewEC{FunctionEffect(FEKind), + EffectConditionExpr(CondExpr)}; + + if (S.diagnoseConflictingFunctionEffect(FPT->getFunctionEffects(), NewEC, + PAttr.getLoc())) { + PAttr.setInvalid(); + return true; + } + + // Add the effect to the FunctionProtoType. + FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); + FunctionEffectSet FX(EPI.FunctionEffects); + FunctionEffectSet::Conflicts Errs; + [[maybe_unused]] bool Success = FX.insert(NewEC, Errs); + assert(Success && "effect conflicts should have been diagnosed above"); + EPI.FunctionEffects = FunctionEffectsRef(FX); + + QualType NewType = S.Context.getFunctionType(FPT->getReturnType(), + FPT->getParamTypes(), EPI); + QT = Unwrapped.wrap(S, NewType->getAs()); + return true; +} + static bool checkMutualExclusion(TypeProcessingState &state, const FunctionProtoType::ExtProtoInfo &EPI, ParsedAttr &Attr, @@ -7834,6 +7944,13 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, return true; } + if (attr.getKind() == ParsedAttr::AT_NonBlocking || + attr.getKind() == ParsedAttr::AT_NonAllocating || + attr.getKind() == ParsedAttr::AT_Blocking || + attr.getKind() == ParsedAttr::AT_Allocating) { + return handleNonBlockingNonAllocatingTypeAttr(state, attr, type, unwrapped); + } + // Delay if the type didn't work out to a function. if (!unwrapped.isFunctionType()) return false; @@ -8086,23 +8203,19 @@ static void HandleNeonVectorTypeAttr(QualType &CurType, const ParsedAttr &Attr, // Target must have NEON (or MVE, whose vectors are similar enough // not to need a separate attribute) - if (!(S.Context.getTargetInfo().hasFeature("neon") || - S.Context.getTargetInfo().hasFeature("mve") || - S.Context.getTargetInfo().hasFeature("sve") || - S.Context.getTargetInfo().hasFeature("sme") || - IsTargetCUDAAndHostARM) && - VecKind == VectorKind::Neon) { - S.Diag(Attr.getLoc(), diag::err_attribute_unsupported) - << Attr << "'neon', 'mve', 'sve' or 'sme'"; + if (!S.Context.getTargetInfo().hasFeature("mve") && + VecKind == VectorKind::Neon && + S.Context.getTargetInfo().getTriple().isArmMClass()) { + S.Diag(Attr.getLoc(), diag::err_attribute_unsupported_m_profile) + << Attr << "'mve'"; Attr.setInvalid(); return; } - if (!(S.Context.getTargetInfo().hasFeature("neon") || - S.Context.getTargetInfo().hasFeature("mve") || - IsTargetCUDAAndHostARM) && - VecKind == VectorKind::NeonPoly) { - S.Diag(Attr.getLoc(), diag::err_attribute_unsupported) - << Attr << "'neon' or 'mve'"; + if (!S.Context.getTargetInfo().hasFeature("mve") && + VecKind == VectorKind::NeonPoly && + S.Context.getTargetInfo().getTriple().isArmMClass()) { + S.Diag(Attr.getLoc(), diag::err_attribute_unsupported_m_profile) + << Attr << "'mve'"; Attr.setInvalid(); return; } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index cf4e80399632b2..51ba22f99e3a3c 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -6267,6 +6267,55 @@ QualType TreeTransform::TransformFunctionProtoType( EPI.ExtParameterInfos = nullptr; } + // Transform any function effects with unevaluated conditions. 
+ // Hold this set in a local for the rest of this function, since EPI + // may need to hold a FunctionEffectsRef pointing into it. + std::optional NewFX; + if (ArrayRef FXConds = EPI.FunctionEffects.conditions(); !FXConds.empty()) { + NewFX.emplace(); + EnterExpressionEvaluationContext Unevaluated( + getSema(), Sema::ExpressionEvaluationContext::ConstantEvaluated); + + for (const FunctionEffectWithCondition &PrevEC : EPI.FunctionEffects) { + FunctionEffectWithCondition NewEC = PrevEC; + if (Expr *CondExpr = PrevEC.Cond.getCondition()) { + ExprResult NewExpr = getDerived().TransformExpr(CondExpr); + if (NewExpr.isInvalid()) + return QualType(); + std::optional Mode = + SemaRef.ActOnEffectExpression(NewExpr.get(), PrevEC.Effect.name()); + if (!Mode) + return QualType(); + + // The condition expression has been transformed, and re-evaluated. + // It may or may not have become constant. + switch (*Mode) { + case FunctionEffectMode::True: + NewEC.Cond = {}; + break; + case FunctionEffectMode::False: + NewEC.Effect = FunctionEffect(PrevEC.Effect.oppositeKind()); + NewEC.Cond = {}; + break; + case FunctionEffectMode::Dependent: + NewEC.Cond = EffectConditionExpr(NewExpr.get()); + break; + case FunctionEffectMode::None: + llvm_unreachable( + "FunctionEffectMode::None shouldn't be possible here"); + } + } + if (!SemaRef.diagnoseConflictingFunctionEffect(*NewFX, NewEC, + TL.getBeginLoc())) { + FunctionEffectSet::Conflicts Errs; + NewFX->insert(NewEC, Errs); + assert(Errs.empty()); + } + } + EPI.FunctionEffects = *NewFX; + EPIChanged = true; + } + QualType Result = TL.getType(); if (getDerived().AlwaysRebuild() || ResultType != T->getReturnType() || T->getParamTypes() != llvm::ArrayRef(ParamTypes) || EPIChanged) { @@ -6502,7 +6551,7 @@ QualType TreeTransform::TransformTypeOfExprType(TypeLocBuilder &TLB, return QualType(); QualType Result = TL.getType(); - TypeOfKind Kind = Result->getAs()->getKind(); + TypeOfKind Kind = Result->castAs()->getKind(); if (getDerived().AlwaysRebuild() || E.get() != TL.getUnderlyingExpr()) { Result = getDerived().RebuildTypeOfExprType(E.get(), TL.getTypeofLoc(), Kind); @@ -6527,7 +6576,7 @@ QualType TreeTransform::TransformTypeOfType(TypeLocBuilder &TLB, return QualType(); QualType Result = TL.getType(); - TypeOfKind Kind = Result->getAs()->getKind(); + TypeOfKind Kind = Result->castAs()->getKind(); if (getDerived().AlwaysRebuild() || New_Under_TI != Old_Under_TI) { Result = getDerived().RebuildTypeOfType(New_Under_TI->getType(), Kind); if (Result.isNull()) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index e8b398615e0e1a..079ac3f0e3545f 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -935,9 +935,8 @@ LocalDeclID LocalDeclID::get(ASTReader &Reader, ModuleFile &MF, DeclID Value) { return ID; } -static LocalDeclID getLocalDeclID(ASTReader &Reader, ModuleFile &MF, - unsigned ModuleFileIndex, - unsigned LocalDeclID) { +LocalDeclID LocalDeclID::get(ASTReader &Reader, ModuleFile &MF, + unsigned ModuleFileIndex, unsigned LocalDeclID) { DeclID Value = (DeclID)ModuleFileIndex << 32 | (DeclID)LocalDeclID; return LocalDeclID::get(Reader, MF, Value); } @@ -7884,7 +7883,7 @@ LocalDeclID ASTReader::mapGlobalIDToModuleFileGlobalID(ModuleFile &M, if (!OrignalModuleFileIndex) return LocalDeclID(); - return getLocalDeclID(*this, M, OrignalModuleFileIndex, ID); + return LocalDeclID::get(*this, M, OrignalModuleFileIndex, ID); } GlobalDeclID ASTReader::ReadDeclID(ModuleFile &F, const 
RecordDataImpl &Record, diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 4b8b515c02c70d..6afb18f932e724 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -3723,6 +3723,23 @@ void ASTDeclReader::attachPreviousDecl(ASTReader &Reader, Decl *D, #include "clang/AST/DeclNodes.inc" } + // [basic.link]/p10: + // If two declarations of an entity are attached to different modules, + // the program is ill-formed; + // + // FIXME: Get rid of the enumeration of decl types once we have an appropriate + // abstract for decls of an entity. e.g., the namespace decl and using decl + // doesn't introduce an entity. + if (Module *M = Previous->getOwningModule(); + M && M->isNamedModule() && + isa(Previous) && + !Reader.getContext().isInSameModule(M, D->getOwningModule())) { + Reader.Diag(Previous->getLocation(), + diag::err_multiple_decl_in_different_modules) + << cast(Previous) << M->Name; + Reader.Diag(D->getLocation(), diag::note_also_found); + } + // If the declaration was visible in one module, a redeclaration of it in // another module remains visible even if it wouldn't be visible by itself. // diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index d99d85652c7b77..9c55960b14cba9 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -5355,6 +5355,20 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot, writeUnhashedControlBlock(PP, Context); + // Don't reuse type ID and Identifier ID from readers for C++ standard named + // modules since we want to support no-transitive-change model for named + // modules. The theory for no-transitive-change model is, + // for a user of a named module, the user can only access the indirectly + // imported decls via the directly imported module. So that it is possible to + // control what matters to the users when writing the module. It would be + // problematic if the users can reuse the type IDs and identifier IDs from + // indirectly imported modules arbitrarily. So we choose to clear these ID + // here. + if (isWritingStdCXXNamedModules()) { + TypeIdxs.clear(); + IdentifierIDs.clear(); + } + // Look for any identifiers that were named while processing the // headers, but are otherwise not needed. We add these to the hash // table to enable checking of the predefines buffer in the case @@ -5387,6 +5401,17 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot, } } + // Form the record of special types. + RecordData SpecialTypes; + AddTypeRef(Context.getRawCFConstantStringType(), SpecialTypes); + AddTypeRef(Context.getFILEType(), SpecialTypes); + AddTypeRef(Context.getjmp_bufType(), SpecialTypes); + AddTypeRef(Context.getsigjmp_bufType(), SpecialTypes); + AddTypeRef(Context.ObjCIdRedefinitionType, SpecialTypes); + AddTypeRef(Context.ObjCClassRedefinitionType, SpecialTypes); + AddTypeRef(Context.ObjCSelRedefinitionType, SpecialTypes); + AddTypeRef(Context.getucontext_tType(), SpecialTypes); + PrepareWritingSpecialDecls(SemaRef); // Write the control block @@ -5418,17 +5443,6 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot, for (auto &Selector : AllSelectors) SemaRef.ObjC().updateOutOfDateSelector(Selector); - // Form the record of special types. 
- RecordData SpecialTypes; - AddTypeRef(Context.getRawCFConstantStringType(), SpecialTypes); - AddTypeRef(Context.getFILEType(), SpecialTypes); - AddTypeRef(Context.getjmp_bufType(), SpecialTypes); - AddTypeRef(Context.getsigjmp_bufType(), SpecialTypes); - AddTypeRef(Context.ObjCIdRedefinitionType, SpecialTypes); - AddTypeRef(Context.ObjCClassRedefinitionType, SpecialTypes); - AddTypeRef(Context.ObjCSelRedefinitionType, SpecialTypes); - AddTypeRef(Context.getucontext_tType(), SpecialTypes); - if (Chain) { // Write the mapping information describing our module dependencies and how // each of those modules were mapped into our own offset/ID space, so that @@ -6686,6 +6700,11 @@ void ASTWriter::ReaderInitialized(ASTReader *Reader) { } void ASTWriter::IdentifierRead(IdentifierID ID, IdentifierInfo *II) { + // Don't reuse Type ID from external modules for named modules. See the + // comments in WriteASTCore for details. + if (isWritingStdCXXNamedModules()) + return; + IdentifierID &StoredID = IdentifierIDs[II]; unsigned OriginalModuleFileIndex = StoredID >> 32; @@ -6708,6 +6727,11 @@ void ASTWriter::MacroRead(serialization::MacroID ID, MacroInfo *MI) { } void ASTWriter::TypeRead(TypeIdx Idx, QualType T) { + // Don't reuse Type ID from external modules for named modules. See the + // comments in WriteASTCore for details. + if (isWritingStdCXXNamedModules()) + return; + // Always take the type index that comes in later module files. // This copes with an interesting // case for chained AST writing where we schedule writing the type and then, diff --git a/clang/lib/StaticAnalyzer/Checkers/NoOwnershipChangeVisitor.cpp b/clang/lib/StaticAnalyzer/Checkers/NoOwnershipChangeVisitor.cpp index 2ff76679b5ebf7..22b5ebfd6fab04 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NoOwnershipChangeVisitor.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NoOwnershipChangeVisitor.cpp @@ -18,6 +18,7 @@ using namespace clang; using namespace ento; using OwnerSet = NoOwnershipChangeVisitor::OwnerSet; +namespace { // Collect which entities point to the allocated memory, and could be // responsible for deallocating it. 
class OwnershipBindingsHandler : public StoreManager::BindingsHandler { @@ -46,6 +47,7 @@ class OwnershipBindingsHandler : public StoreManager::BindingsHandler { out << "}\n"; } }; +} // namespace OwnerSet NoOwnershipChangeVisitor::getOwnersAtNode(const ExplodedNode *N) { OwnerSet Ret; diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp index b73534136fdf02..eea93a41f13841 100644 --- a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp @@ -43,8 +43,6 @@ class PointerSubChecker bool checkArrayBounds(CheckerContext &C, const Expr *E, const ElementRegion *ElemReg, const MemRegion *Reg) const; - void reportBug(CheckerContext &C, const Expr *E, - const llvm::StringLiteral &Msg) const; public: void checkPreStmt(const BinaryOperator *B, CheckerContext &C) const; @@ -57,6 +55,14 @@ bool PointerSubChecker::checkArrayBounds(CheckerContext &C, const Expr *E, if (!ElemReg) return true; + auto ReportBug = [&](const llvm::StringLiteral &Msg) { + if (ExplodedNode *N = C.generateNonFatalErrorNode()) { + auto R = std::make_unique(BT, Msg, N); + R->addRange(E->getSourceRange()); + C.emitReport(std::move(R)); + } + }; + ProgramStateRef State = C.getState(); const MemRegion *SuperReg = ElemReg->getSuperRegion(); SValBuilder &SVB = C.getSValBuilder(); @@ -64,7 +70,7 @@ bool PointerSubChecker::checkArrayBounds(CheckerContext &C, const Expr *E, if (SuperReg == Reg) { if (const llvm::APSInt *I = SVB.getKnownValue(State, ElemReg->getIndex()); I && (!I->isOne() && !I->isZero())) - reportBug(C, E, Msg_BadVarIndex); + ReportBug(Msg_BadVarIndex); return false; } @@ -77,7 +83,7 @@ bool PointerSubChecker::checkArrayBounds(CheckerContext &C, const Expr *E, ProgramStateRef S1, S2; std::tie(S1, S2) = C.getState()->assume(*IndexTooLarge); if (S1 && !S2) { - reportBug(C, E, Msg_LargeArrayIndex); + ReportBug(Msg_LargeArrayIndex); return false; } } @@ -89,22 +95,13 @@ bool PointerSubChecker::checkArrayBounds(CheckerContext &C, const Expr *E, ProgramStateRef S1, S2; std::tie(S1, S2) = State->assume(*IndexTooSmall); if (S1 && !S2) { - reportBug(C, E, Msg_NegativeArrayIndex); + ReportBug(Msg_NegativeArrayIndex); return false; } } return true; } -void PointerSubChecker::reportBug(CheckerContext &C, const Expr *E, - const llvm::StringLiteral &Msg) const { - if (ExplodedNode *N = C.generateNonFatalErrorNode()) { - auto R = std::make_unique(BT, Msg, N); - R->addRange(E->getSourceRange()); - C.emitReport(std::move(R)); - } -} - void PointerSubChecker::checkPreStmt(const BinaryOperator *B, CheckerContext &C) const { // When doing pointer subtraction, if the two pointers do not point to the @@ -136,6 +133,9 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B, if (!checkArrayBounds(C, B->getRHS(), ElemRR, LR)) return; + const ValueDecl *DiffDeclL = nullptr; + const ValueDecl *DiffDeclR = nullptr; + if (ElemLR && ElemRR) { const MemRegion *SuperLR = ElemLR->getSuperRegion(); const MemRegion *SuperRR = ElemRR->getSuperRegion(); @@ -144,9 +144,30 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B, // Allow arithmetic on different symbolic regions. 
if (isa(SuperLR) || isa(SuperRR)) return; + if (const auto *SuperDLR = dyn_cast(SuperLR)) + DiffDeclL = SuperDLR->getDecl(); + if (const auto *SuperDRR = dyn_cast(SuperRR)) + DiffDeclR = SuperDRR->getDecl(); } - reportBug(C, B, Msg_MemRegionDifferent); + if (ExplodedNode *N = C.generateNonFatalErrorNode()) { + auto R = + std::make_unique(BT, Msg_MemRegionDifferent, N); + R->addRange(B->getSourceRange()); + // The declarations may be identical even if the regions are different: + // struct { int array[10]; } a, b; + // do_something(&a.array[5] - &b.array[5]); + // In this case don't emit notes. + if (DiffDeclL != DiffDeclR) { + if (DiffDeclL) + R->addNote("Array at the left-hand side of subtraction", + {DiffDeclL, C.getSourceManager()}); + if (DiffDeclR) + R->addNote("Array at the right-hand side of subtraction", + {DiffDeclR, C.getSourceManager()}); + } + C.emitReport(std::move(R)); + } } void ento::registerPointerSubChecker(CheckerManager &mgr) { diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 845a5f9b390dc4..8f4bd17afc8581 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -672,7 +672,7 @@ class StdLibraryFunctionsChecker StringRef getNote() const { return Note; } }; - using ArgTypes = std::vector>; + using ArgTypes = ArrayRef>; using RetType = std::optional; // A placeholder type, we use it whenever we do not care about the concrete @@ -1746,7 +1746,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( } // Add the same summary for different names with the Signature explicitly // given. - void operator()(std::vector Names, Signature Sign, Summary Sum) { + void operator()(ArrayRef Names, Signature Sign, Summary Sum) { for (StringRef Name : Names) operator()(Name, Sign, Sum); } diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 613c221de7b4cd..9aee7f952ad2d8 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -10,6 +10,9 @@ // //===----------------------------------------------------------------------===// +#include "NoOwnershipChangeVisitor.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" #include "clang/StaticAnalyzer/Core/Checker.h" @@ -74,6 +77,12 @@ struct StreamErrorState { /// Returns if the StreamErrorState is a valid object. operator bool() const { return NoError || FEof || FError; } + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &os) const { + os << "NoError: " << NoError << ", FEof: " << FEof + << ", FError: " << FError; + } + void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddBoolean(NoError); ID.AddBoolean(FEof); @@ -98,6 +107,18 @@ struct StreamState { OpenFailed /// The last open operation has failed. } State; + StringRef getKindStr() const { + switch (State) { + case Opened: + return "Opened"; + case Closed: + return "Closed"; + case OpenFailed: + return "OpenFailed"; + } + llvm_unreachable("Unknown StreamState!"); + } + /// State of the error flags. /// Ignored in non-opened stream state but must be NoError. 
StreamErrorState const ErrorState; @@ -146,6 +167,9 @@ struct StreamState { return StreamState{L, OpenFailed, {}, false}; } + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &os) const; + void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddPointer(LastOperation); ID.AddInteger(State); @@ -183,6 +207,14 @@ struct FnDescription { ArgNoTy StreamArgNo; }; +LLVM_DUMP_METHOD void StreamState::dumpToStream(llvm::raw_ostream &os) const { + os << "{Kind: " << getKindStr() << ", Last operation: " << LastOperation + << ", ErrorState: "; + ErrorState.dumpToStream(os); + os << ", FilePos: " << (FilePositionIndeterminate ? "Indeterminate" : "OK") + << '}'; +} + /// Get the value of the stream argument out of the passed call event. /// The call should contain a function that is described by Desc. SVal getStreamArg(const FnDescription *Desc, const CallEvent &Call) { @@ -300,6 +332,8 @@ class StreamChecker : public Checker FnDescriptions = { {{CDM::CLibrary, {"fopen"}, 2}, @@ -310,8 +344,7 @@ class StreamChecker : public Checker(&Checker); + return StreamChk->FCloseDesc.matchesAsWritten(Call); + } + + bool doesFnIntendToHandleOwnership(const Decl *Callee, + ASTContext &ACtx) final { + using namespace clang::ast_matchers; + const FunctionDecl *FD = dyn_cast(Callee); + + auto Matches = + match(findAll(callExpr().bind("call")), *FD->getBody(), ACtx); + for (BoundNodes Match : Matches) { + if (const auto *Call = Match.getNodeAs("call")) + if (isClosingCallAsWritten(*Call)) + return true; + } + // TODO: Ownership might change with an attempt to store stream object, not + // only through closing it. Check for attempted stores as well. + return false; + } + + bool hasResourceStateChanged(ProgramStateRef CallEnterState, + ProgramStateRef CallExitEndState) final { + return CallEnterState->get(Sym) != + CallExitEndState->get(Sym); + } + + PathDiagnosticPieceRef emitNote(const ExplodedNode *N) override { + PathDiagnosticLocation L = PathDiagnosticLocation::create( + N->getLocation(), + N->getState()->getStateManager().getContext().getSourceManager()); + return std::make_shared( + L, "Returning without closing stream object or storing it for later " + "release"); + } + +public: + NoStreamStateChangeVisitor(SymbolRef Sym, const StreamChecker *Checker) + : NoOwnershipChangeVisitor(Sym, Checker) {} +}; + +} // end anonymous namespace + const ExplodedNode *StreamChecker::getAcquisitionSite(const ExplodedNode *N, SymbolRef StreamSym, CheckerContext &C) { @@ -1872,6 +1961,7 @@ StreamChecker::reportLeaks(const SmallVector &LeakedSyms, LocUsedForUniqueing, StreamOpenNode->getLocationContext()->getDecl()); R->markInteresting(LeakSym); + R->addVisitor(LeakSym, this); C.emitReport(std::move(R)); } diff --git a/clang/lib/Tooling/Inclusions/Stdlib/CMakeLists.txt b/clang/lib/Tooling/Inclusions/Stdlib/CMakeLists.txt index ed323ab3528b10..4d01a0850a074b 100644 --- a/clang/lib/Tooling/Inclusions/Stdlib/CMakeLists.txt +++ b/clang/lib/Tooling/Inclusions/Stdlib/CMakeLists.txt @@ -4,6 +4,7 @@ # our goal is to disable the /Og flag while retaining the other optimizations from the /O1|/O2 set if(MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES Clang AND MSVC_VERSION VERSION_GREATER_EQUAL 1932 + AND MSVC_VERSION VERSION_LESS 1939 AND CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64") string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) diff --git a/clang/lib/Tooling/Transformer/RangeSelector.cpp b/clang/lib/Tooling/Transformer/RangeSelector.cpp index 
7370baf010834c..e84ddde74a7079 100644 --- a/clang/lib/Tooling/Transformer/RangeSelector.cpp +++ b/clang/lib/Tooling/Transformer/RangeSelector.cpp @@ -96,13 +96,6 @@ static SourceLocation findPreviousTokenKind(SourceLocation Start, } } -static SourceLocation findOpenParen(const CallExpr &E, const SourceManager &SM, - const LangOptions &LangOpts) { - SourceLocation EndLoc = - E.getNumArgs() == 0 ? E.getRParenLoc() : E.getArg(0)->getBeginLoc(); - return findPreviousTokenKind(EndLoc, SM, LangOpts, tok::TokenKind::l_paren); -} - RangeSelector transformer::before(RangeSelector Selector) { return [Selector](const MatchResult &Result) -> Expected { Expected SelectedRange = Selector(Result); @@ -287,18 +280,50 @@ RangeSelector transformer::statements(std::string ID) { } namespace { -// Returns the range of the source between the call's parentheses. -CharSourceRange getCallArgumentsRange(const MatchResult &Result, - const CallExpr &CE) { + +SourceLocation getRLoc(const CallExpr &E) { return E.getRParenLoc(); } + +SourceLocation getRLoc(const CXXConstructExpr &E) { + return E.getParenOrBraceRange().getEnd(); +} + +tok::TokenKind getStartToken(const CallExpr &E) { + return tok::TokenKind::l_paren; +} + +tok::TokenKind getStartToken(const CXXConstructExpr &E) { + return isa(E) ? tok::TokenKind::l_paren + : tok::TokenKind::l_brace; +} + +template +SourceLocation findArgStartDelimiter(const ExprWithArgs &E, SourceLocation RLoc, + const SourceManager &SM, + const LangOptions &LangOpts) { + SourceLocation Loc = E.getNumArgs() == 0 ? RLoc : E.getArg(0)->getBeginLoc(); + return findPreviousTokenKind(Loc, SM, LangOpts, getStartToken(E)); +} +// Returns the range of the source between the call's or construct expr's +// parentheses/braces. +template +CharSourceRange getArgumentsRange(const MatchResult &Result, + const ExprWithArgs &CE) { + const SourceLocation RLoc = getRLoc(CE); return CharSourceRange::getCharRange( - findOpenParen(CE, *Result.SourceManager, Result.Context->getLangOpts()) + findArgStartDelimiter(CE, RLoc, *Result.SourceManager, + Result.Context->getLangOpts()) .getLocWithOffset(1), - CE.getRParenLoc()); + RLoc); } } // namespace RangeSelector transformer::callArgs(std::string ID) { - return RelativeSelector(std::move(ID)); + return RelativeSelector>(std::move(ID)); +} + +RangeSelector transformer::constructExprArgs(std::string ID) { + return RelativeSelector>(std::move(ID)); } namespace { diff --git a/clang/lib/Tooling/Transformer/Stencil.cpp b/clang/lib/Tooling/Transformer/Stencil.cpp index d91c9e0a20cc1b..bc4fa6e36057c1 100644 --- a/clang/lib/Tooling/Transformer/Stencil.cpp +++ b/clang/lib/Tooling/Transformer/Stencil.cpp @@ -51,7 +51,7 @@ static Error printNode(StringRef Id, const MatchFinder::MatchResult &Match, if (auto Err = NodeOrErr.takeError()) return Err; NodeOrErr->print(Os, PrintingPolicy(Match.Context->getLangOpts())); - *Result += Os.str(); + *Result += Output; return Error::success(); } @@ -371,7 +371,7 @@ class SelectBoundStencil : public clang::transformer::StencilInterface { Stream << ", " << DefaultStencil->toString(); } Stream << ")"; - return Stream.str(); + return Buffer; } private: diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp index 6f6fca8c1cfd8e..933437c3401c40 100644 --- a/clang/test/AST/Interp/arrays.cpp +++ b/clang/test/AST/Interp/arrays.cpp @@ -26,6 +26,9 @@ static_assert(foo[2][2] == nullptr, ""); static_assert(foo[2][3] == &m, ""); static_assert(foo[2][4] == nullptr, ""); +constexpr int afterEnd[] = {1,2,3}; 
+static_assert(&afterEnd[3] == afterEnd + 3, ""); + constexpr int ZeroSizeArray[] = {}; constexpr int SomeInt[] = {1}; @@ -510,11 +513,9 @@ namespace NonConstReads { // both-note {{read of non-const variable 'z'}} #else void *p = nullptr; - int arr[!p]; // ref-error {{not allowed at file scope}} \ - // expected-error {{not allowed at file scope}} + int arr[!p]; // both-error {{not allowed at file scope}} int z; - int a[z]; // ref-error {{not allowed at file scope}} \ - // expected-error {{not allowed at file scope}} + int a[z]; // both-error {{not allowed at file scope}} #endif const int y = 0; @@ -623,3 +624,4 @@ constexpr int *get2() { extern int same_entity_2[]; return same_entity_2; } +static_assert(get2() == same_entity_2, "failed to find previous decl"); diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 1cc450e48def06..684658da26a0ed 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -288,3 +288,8 @@ char test10_global[test10_bound]; // all-error {{variable length array declarati void test10(void) { char test10_local[test10_bound] = "help"; // all-error {{variable-sized object may not be initialized}} } + +void SuperSpecialFunc(void) { +const int SuperSpecialCase = 10; +_Static_assert((sizeof(SuperSpecialCase) == 12 && SuperSpecialCase == 3) || SuperSpecialCase == 10, ""); // pedantic-warning {{GNU extension}} +} diff --git a/clang/test/AST/Interp/const-temporaries.cpp b/clang/test/AST/Interp/const-temporaries.cpp new file mode 100644 index 00000000000000..1dc16ef63a8468 --- /dev/null +++ b/clang/test/AST/Interp/const-temporaries.cpp @@ -0,0 +1,101 @@ +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin -emit-llvm -o - %s -std=c++1y -fexperimental-new-constant-interpreter | FileCheck %s +// +// expected-no-diagnostics + +struct A { + constexpr A() : n(1) {} + ~A(); + int n; +}; +struct B : A { + A a[3]; + constexpr B() { + ++a[0].n; + a[1].n += 2; + a[2].n = n + a[1].n; + } +}; +B b; + +// CHECK: @b ={{.*}} global {{.*}} i32 1, {{.*}} { i32 2 }, {{.*}} { i32 3 }, {{.*}} { i32 4 } +// CHECK-NOT: _ZN1BC + +namespace ModifyStaticTemporary { + struct A { int &&temporary; int x; }; + constexpr int f(int &r) { r *= 9; return r - 12; } + A a = { 6, f(a.temporary) }; + // CHECK: @_ZGRN21ModifyStaticTemporary1aE_ = internal global i32 54 + // CHECK: @_ZN21ModifyStaticTemporary1aE ={{.*}} global {{.*}} ptr @_ZGRN21ModifyStaticTemporary1aE_, i32 42 + + A b = { 7, ++b.temporary }; + // CHECK: @_ZGRN21ModifyStaticTemporary1bE_ = internal global i32 8 + // CHECK: @_ZN21ModifyStaticTemporary1bE ={{.*}} global {{.*}} ptr @_ZGRN21ModifyStaticTemporary1bE_, i32 8 + + // Can't emit all of 'c' as a constant here, so emit the initial value of + // 'c.temporary', not the value as modified by the partial evaluation within + // the initialization of 'c.x'. 
+ A c = { 10, (++c.temporary, b.x) }; + // CHECK: @_ZN21ModifyStaticTemporary1cE ={{.*}} global {{.*}} zeroinitializer + // CHECK: @_ZGRN21ModifyStaticTemporary1cE_ = internal global i32 10 +} + +// CHECK: @_ZGRN28VariableTemplateWithConstRef1iIvEE_ = linkonce_odr constant i32 5, align 4 +// CHECK: @_ZN28VariableTemplateWithConstRef3useE ={{.*}} constant ptr @_ZGRN28VariableTemplateWithConstRef1iIvEE_ +namespace VariableTemplateWithConstRef { + template + const int &i = 5; + const int &use = i; +} + +// CHECK: @_ZGRN34HiddenVariableTemplateWithConstRef1iIvEE_ = linkonce_odr hidden constant i32 5, align 4 +// CHECK: @_ZN34HiddenVariableTemplateWithConstRef3useE ={{.*}} constant ptr @_ZGRN34HiddenVariableTemplateWithConstRef1iIvEE_ +namespace HiddenVariableTemplateWithConstRef { + template + __attribute__((visibility("hidden"))) const int &i = 5; + const int &use = i; +} + +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE1_ = linkonce_odr constant i32 1 +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE0_ = linkonce_odr global {{.*}} { ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE1_ } +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE3_ = linkonce_odr constant i32 2 +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE2_ = linkonce_odr global {{.*}} { ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE3_ } +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE5_ = linkonce_odr constant i32 3 +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE4_ = linkonce_odr global {{.*}} { ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE5_ } +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE7_ = linkonce_odr constant i32 4 +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE6_ = linkonce_odr global {{.*}} { ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE7_ } +// CHECK: @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE_ = linkonce_odr global %"struct.VariableTemplateWithPack::S" { ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE0_, ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE2_, ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE4_, ptr @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE6_ } +// CHECK: @_ZN24VariableTemplateWithPack1pE ={{.*}} global {{.*}} @_ZGRN24VariableTemplateWithPack1sIJLi1ELi2ELi3ELi4EEEE_ +namespace VariableTemplateWithPack { + struct A { + const int &r; + }; + struct S { + A &&a, &&b, &&c, &&d; + }; + template + S &&s = {A{N}...}; + S *p = &s<1, 2, 3, 4>; +} + + +// CHECK: @_ZGR1z_ ={{.*}} global [2 x i32] [i32 10, i32 2] +// CHECK: @z = global { ptr, i32 } { ptr @_ZGR1z_, i32 10 } +typedef int v[2]; +struct Z { int &&x, y; }; +Z z = { v{1,2}[0], z.x = 10 }; + +// CHECK: @_ZGR2z2_ ={{.*}} global %struct.R { i64 10 } +// @z = {{.}} global %struct.Z { ptr @_ZGR1z_, %struct.R { i64 10 } } +struct R { mutable long x; }; +struct Z2 { const R &x, y; }; +Z2 z2 = { R{1}, z2.x.x = 10 }; + +// CHECK: __cxa_atexit({{.*}} @_ZN1BD1Ev, {{.*}} @b + +// CHECK: define +// CHECK-NOT: @_ZGRN21ModifyStaticTemporary1cE_ +// CHECK: store {{.*}} @_ZGRN21ModifyStaticTemporary1cE_, {{.*}} @_ZN21ModifyStaticTemporary1cE +// CHECK: add +// CHECK: store +// CHECK: load {{.*}} @_ZN21ModifyStaticTemporary1bE +// CHECK: store {{.*}} @_ZN21ModifyStaticTemporary1cE diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp index 10c62a43ef33b4..f43be1d3c0403d 100644 --- 
a/clang/test/AST/Interp/functions.cpp +++ b/clang/test/AST/Interp/functions.cpp @@ -473,6 +473,10 @@ namespace AddressOf { constexpr _Complex float F = {3, 4}; static_assert(__builtin_addressof(F) == &F, ""); + + void testAddressof(int x) { + static_assert(&x == __builtin_addressof(x), ""); + } } namespace std { diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp index 5a29013a053a20..a7a602ec355d5f 100644 --- a/clang/test/AST/Interp/literals.cpp +++ b/clang/test/AST/Interp/literals.cpp @@ -45,6 +45,8 @@ constexpr int Failed2 = Failed1 + 1; // both-error {{must be initialized by a co static_assert(Failed2 == 0, ""); // both-error {{not an integral constant expression}} \ // both-note {{initializer of 'Failed2' is not a constant expression}} +const int x = *(volatile int*)0x1234; + namespace ScalarTypes { constexpr int ScalarInitInt = int(); static_assert(ScalarInitInt == 0, ""); @@ -1211,6 +1213,18 @@ constexpr int externvar1() { // both-error {{never produces a constant expressio return arr[0]; // ref-note {{read of non-constexpr variable 'arr'}} \ // expected-note {{indexing of array without known bound}} } + +namespace StmtExprs { + constexpr int foo() { + ({ + int i; + for (i = 0; i < 76; i++) {} + i; // both-warning {{expression result unused}} + }); + return 76; + } + static_assert(foo() == 76, ""); +} #endif namespace Extern { diff --git a/clang/test/AST/Interp/mutable.cpp b/clang/test/AST/Interp/mutable.cpp new file mode 100644 index 00000000000000..aebbea920578c6 --- /dev/null +++ b/clang/test/AST/Interp/mutable.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++11 -verify=expected,expected11,both,both11 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify=expected,expected14,both %s +// RUN: %clang_cc1 -std=c++11 -verify=ref,ref11,both,both11 %s +// RUN: %clang_cc1 -std=c++14 -verify=ref,ref14,both %s + + + + + +namespace Simple { + struct S { + mutable int a; // both-note {{declared here}} \ + // both11-note {{declared here}} + int a2; + }; + + constexpr S s{12, 24}; + static_assert(s.a == 12, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of mutable member 'a'}} + static_assert(s.a2 == 24, ""); + + + constexpr S s2{12, s2.a}; // both11-error {{must be initialized by a constant expression}} \ + // both11-note {{read of mutable member 'a'}} \ + // both11-note {{declared here}} + static_assert(s2.a2 == 12, ""); // both11-error {{not an integral constant expression}} \ + // both11-note {{initializer of 's2' is not a constant expression}} +} diff --git a/clang/test/AST/Interp/references.cpp b/clang/test/AST/Interp/references.cpp index efb756545b4aa1..9a790dc75d7308 100644 --- a/clang/test/AST/Interp/references.cpp +++ b/clang/test/AST/Interp/references.cpp @@ -130,3 +130,8 @@ const char (&nonextended_string_ref)[3] = {"hi"}; static_assert(nonextended_string_ref[0] == 'h', ""); static_assert(nonextended_string_ref[1] == 'i', ""); static_assert(nonextended_string_ref[2] == '\0', ""); + +/// This isa non-constant context. Reading A is not allowed, +/// but taking its address is. 
+int &&A = 12; +int arr[!&A]; diff --git a/clang/test/Analysis/analyzer-checker-option-help.c b/clang/test/Analysis/analyzer-checker-option-help.c index 5f95569e58498e..c95b8e7064c5f7 100644 --- a/clang/test/Analysis/analyzer-checker-option-help.c +++ b/clang/test/Analysis/analyzer-checker-option-help.c @@ -34,27 +34,7 @@ // CHECK-STABLE: OPTIONS: // // CHECK-STABLE: cplusplus.Move:WarnOn -// CHECK-STABLE-SAME: (string) In non-aggressive mode, only warn -// CHECK-STABLLE: on use-after-move of local variables (or -// CHECK-STABLLE: local rvalue references) and of STL objects. -// CHECK-STABLLE: The former is possible because local variables -// CHECK-STABLLE: (or local rvalue references) are not tempting -// CHECK-STABLLE: their user to re-use the storage. The latter -// CHECK-STABLLE: is possible because STL objects are known -// CHECK-STABLLE: to end up in a valid but unspecified state -// CHECK-STABLLE: after the move and their state-reset methods -// CHECK-STABLLE: are also known, which allows us to predict -// CHECK-STABLLE: precisely when use-after-move is invalid. -// CHECK-STABLLE: Some STL objects are known to conform to -// CHECK-STABLLE: additional contracts after move, so they -// CHECK-STABLLE: are not tracked. However, smart pointers -// CHECK-STABLLE: specifically are tracked because we can -// CHECK-STABLLE: perform extra checking over them. In aggressive -// CHECK-STABLLE: mode, warn on any use-after-move because -// CHECK-STABLLE: the user has intentionally asked us to completely -// CHECK-STABLLE: eliminate use-after-move in his code. Values: -// CHECK-STABLLE: "KnownsOnly", "KnownsAndLocals", "All". -// CHECK-STABLLE: (default: KnownsAndLocals) +// CHECK-STABLE-SAME: (string) With setting "KnownsOnly" warn // CHECK-STABLE-NOT: debug.AnalysisOrder:* // CHECK-DEVELOPER: debug.AnalysisOrder:* diff --git a/clang/test/Analysis/casts.c b/clang/test/Analysis/casts.c index 7dad4edfa89b99..462a9865f15640 100644 --- a/clang/test/Analysis/casts.c +++ b/clang/test/Analysis/casts.c @@ -129,7 +129,7 @@ void locAsIntegerCasts(void *p) { } void multiDimensionalArrayPointerCasts(void) { - static int x[10][10]; + static int x[10][10]; // expected-note2{{Array at the right-hand side of subtraction}} int *y1 = &(x[3][5]); char *z = ((char *) y1) + 2; int *y2 = (int *)(z - 2); diff --git a/clang/test/Analysis/pointer-sub-notes.c b/clang/test/Analysis/pointer-sub-notes.c new file mode 100644 index 00000000000000..6e347557354226 --- /dev/null +++ b/clang/test/Analysis/pointer-sub-notes.c @@ -0,0 +1,66 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.PointerSub -analyzer-output=text -verify %s + +void negative_1() { + int a[3]; + int x = -1; + // FIXME: should indicate that 'x' is -1 + int d = &a[x] - &a[0]; // expected-warning{{Using a negative array index at pointer subtraction is undefined behavior}} \ + // expected-note{{Using a negative array index at pointer subtraction is undefined behavior}} +} + +void negative_2() { + int a[3]; + int *p1 = a, *p2 = a; + --p2; + // FIXME: should indicate that 'p2' is negative + int d = p1 - p2; // expected-warning{{Using a negative array index at pointer subtraction is undefined behavior}} \ + // expected-note{{Using a negative array index at pointer subtraction is undefined behavior}} +} + +void different_1() { + int a[3]; // expected-note{{Array at the left-hand side of subtraction}} + int b[3]; // expected-note{{Array at the right-hand side of subtraction}} + int d = &a[2] - &b[0]; // expected-warning{{Subtraction of two pointers that do 
not point into the same array is undefined behavior}} \ + // expected-note{{Subtraction of two pointers that do not point into the same array is undefined behavior}} +} + +void different_2() { + int a[3]; // expected-note{{Array at the right-hand side of subtraction}} + int b[3]; // expected-note{{Array at the left-hand side of subtraction}} + int *p1 = a + 1; + int *p2 = b; + int d = p2 - p1; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} \ + // expected-note{{Subtraction of two pointers that do not point into the same array is undefined behavior}} +} + +int different_3() { + struct { + int array[5]; + } a, b; + return &a.array[3] - &b.array[2]; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} \ + // expected-note{{Subtraction of two pointers that do not point into the same array is undefined behavior}} +} + +int different_4() { + struct { + int array1[5]; // expected-note{{Array at the left-hand side of subtraction}} + int array2[5]; // expected-note{{Array at the right-hand side of subtraction}} + } a; + return &a.array1[3] - &a.array2[4]; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} \ + // expected-note{{Subtraction of two pointers that do not point into the same array is undefined behavior}} +} + +void different_5() { + int d; + static int x[10][10]; // expected-note2{{Array at the left-hand side of subtraction}} + int *y1 = &(x[3][5]); + char *z = ((char *) y1) + 2; + int *y2 = (int *)(z - 2); + int *y3 = ((int *)x) + 35; // This is offset for [3][5]. + + d = y2 - y1; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} \ + // expected-note{{Subtraction of two pointers that do not point into the same array is undefined behavior}} + d = y3 - y1; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} \ + // expected-note{{Subtraction of two pointers that do not point into the same array is undefined behavior}} + d = y3 - y2; +} diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c index 9a446547e2868f..88e6dec2d172f8 100644 --- a/clang/test/Analysis/pointer-sub.c +++ b/clang/test/Analysis/pointer-sub.c @@ -19,7 +19,8 @@ void f1(void) { } void f2(void) { - int a[10], b[10], c; + int a[10], b[10], c; // expected-note{{Array at the left-hand side of subtraction}} \ + // expected-note2{{Array at the right-hand side of subtraction}} int *p = &a[2]; int *q = &a[8]; int d = q - p; // no-warning (pointers into the same array) @@ -41,7 +42,8 @@ void f2(void) { } void f3(void) { - int a[3][4]; + int a[3][4]; // expected-note{{Array at the left-hand side of subtraction}} \ + // expected-note2{{Array at the right-hand side of subtraction}} int d; d = &(a[2]) - &(a[1]); @@ -74,8 +76,8 @@ void f4(void) { typedef struct { int a; int b; - int c[10]; - int d[10]; + int c[10]; // expected-note2{{Array at the right-hand side of subtraction}} + int d[10]; // expected-note2{{Array at the left-hand side of subtraction}} } S; void f5(void) { @@ -100,8 +102,8 @@ void f6(void) { char *a1 = (char *)&l; int d = a1[3] - l; - long long la1[3]; - long long la2[3]; + long long la1[3]; // expected-note{{Array at the right-hand side of subtraction}} + long long la2[3]; // expected-note{{Array at the left-hand side of subtraction}} char *pla1 = (char *)la1; char *pla2 = (char *)la2; d = 
pla1[1] - pla1[0]; diff --git a/clang/test/Analysis/stream-notes-missing-close.cpp b/clang/test/Analysis/stream-notes-missing-close.cpp new file mode 100644 index 00000000000000..272ca96c76ee18 --- /dev/null +++ b/clang/test/Analysis/stream-notes-missing-close.cpp @@ -0,0 +1,179 @@ +// RUN: %clang_analyze_cc1 -verify %s -analyzer-output=text \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=unix.Stream + + +#include "Inputs/system-header-simulator.h" +char *logDump(); +bool coin(); + +[[noreturn]] void halt(); + +void assert(bool b) { + if (!b) + halt(); +} + +//===----------------------------------------------------------------------===// +// Report for which we expect NoOwnershipChangeVisitor to add a new note. +//===----------------------------------------------------------------------===// + +namespace stream_opened_in_fn_call { +// TODO: AST analysis of sink would reveal that it doesn't intent to free the +// allocated memory, but in this instance, its also the only function with +// the ability to do so, we should see a note here. +void sink(FILE *f) { +} + +void f() { + sink(fopen("input.txt", "w")); + // expected-note@-1{{Stream opened here}} +} // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} +// expected-note@-1{{Opened stream never closed. Potential resource leak}} +} // namespace stream_opened_in_fn_call + +namespace stream_passed_to_fn_call { + +void expectedClose(FILE *f) { + if (char *log = logDump()) { // expected-note{{Assuming 'log' is null}} + // expected-note@-1{{Taking false branch}} + printf("%s", log); + fclose(f); + } +} // expected-note{{Returning without closing stream object or storing it for later release}} + +void f() { + FILE *f = fopen("input.txt", "w"); // expected-note{{Stream opened here}} + if (!f) // expected-note{{'f' is non-null}} + // expected-note@-1{{Taking false branch}} + return; + if (coin()) { // expected-note{{Assuming the condition is true}} + // expected-note@-1{{Taking true branch}} + expectedClose(f); // expected-note{{Calling 'expectedClose'}} + // expected-note@-1{{Returning from 'expectedClose'}} + + return; // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} + // expected-note@-1{{Opened stream never closed. Potential resource leak}} + } + fclose(f); +} +} // namespace stream_passed_to_fn_call + +namespace stream_shared_with_ptr_of_shorter_lifetime { + +void sink(FILE *f) { + FILE *Q = f; + if (coin()) // expected-note {{Assuming the condition is false}} + // expected-note@-1 {{Taking false branch}} + fclose(f); + (void)Q; +} // expected-note{{Returning without closing stream object or storing it for later release}} + +void foo() { + FILE *f = fopen("input.txt", "w"); // expected-note{{Stream opened here}} + if (!f) // expected-note{{'f' is non-null}} + // expected-note@-1{{Taking false branch}} + return; + sink(f); // expected-note {{Calling 'sink'}} + // expected-note@-1 {{Returning from 'sink'}} +} // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} +// expected-note@-1{{Opened stream never closed. Potential resource leak}} + +} // namespace stream_shared_with_ptr_of_shorter_lifetime + +//===----------------------------------------------------------------------===// +// Report for which we *do not* expect NoOwnershipChangeVisitor add a new note, +// nor do we want it to. 
+//===----------------------------------------------------------------------===// + +namespace stream_not_passed_to_fn_call { + +void expectedClose(FILE *f) { + if (char *log = logDump()) { + printf("%s", log); + fclose(f); + } +} + +void f(FILE *p) { + FILE *f = fopen("input.txt", "w"); // expected-note{{Stream opened here}} + if (!f) // expected-note{{'f' is non-null}} + // expected-note@-1{{Taking false branch}} + return; + expectedClose(p); // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} + // expected-note@-1{{Opened stream never closed. Potential resource leak}} +} +} // namespace stream_not_passed_to_fn_call + +namespace stream_shared_with_ptr_of_same_lifetime { + +void expectedClose(FILE *f, FILE **p) { + // NOTE: Not a job of NoOwnershipChangeVisitor, but maybe this could be + // highlighted still? + *p = f; +} + +void f() { + FILE *f = fopen("input.txt", "w"); // expected-note{{Stream opened here}} + FILE *p = NULL; + if (!f) // expected-note{{'f' is non-null}} + // expected-note@-1{{Taking false branch}} + return; + expectedClose(f, &p); +} // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} + // expected-note@-1{{Opened stream never closed. Potential resource leak}} +} // namespace stream_shared_with_ptr_of_same_lifetime + +namespace stream_passed_into_fn_that_doesnt_intend_to_free { +void expectedClose(FILE *f) { +} + +void f() { + FILE *f = fopen("input.txt", "w"); // expected-note{{Stream opened here}} + if (!f) // expected-note{{'f' is non-null}} + // expected-note@-1{{Taking false branch}} + return; + expectedClose(f); + +} // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} + // expected-note@-1{{Opened stream never closed. Potential resource leak}} +} // namespace stream_passed_into_fn_that_doesnt_intend_to_free + +namespace stream_passed_into_fn_that_doesnt_intend_to_free2 { +void bar(); + +void expectedClose(FILE *f) { + // Correctly realize that calling bar() doesn't mean that this function would + // like to deallocate anything. + bar(); +} + +void f() { + FILE *f = fopen("input.txt", "w"); // expected-note{{Stream opened here}} + if (!f) // expected-note{{'f' is non-null}} + // expected-note@-1{{Taking false branch}} + return; + expectedClose(f); + +} // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} + // expected-note@-1{{Opened stream never closed. Potential resource leak}} +} // namespace stream_passed_into_fn_that_doesnt_intend_to_free2 + +namespace streamstate_from_closed_to_open { + +// StreamState of the symbol changed from nothing to Allocated. We don't want to +// emit notes when the RefKind changes in the stack frame. +static FILE *fopenWrapper() { + FILE *f = fopen("input.txt", "w"); // expected-note{{Stream opened here}} + assert(f); + return f; +} +void use_ret() { + FILE *v; + v = fopenWrapper(); // expected-note {{Calling 'fopenWrapper'}} + // expected-note@-1{{Returning from 'fopenWrapper'}} + +} // expected-warning{{Opened stream never closed. Potential resource leak [unix.Stream]}} + // expected-note@-1{{Opened stream never closed. 
Potential resource leak}} + +} // namespace streamstate_from_closed_to_open diff --git a/clang/test/C/C11/n1285.c b/clang/test/C/C11/n1285.c new file mode 100644 index 00000000000000..e7a9463e68103a --- /dev/null +++ b/clang/test/C/C11/n1285.c @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -verify=wrong -std=c99 %s +// RUN: %clang_cc1 -verify=wrong -std=c11 %s +// RUN: %clang_cc1 -verify=cpp -std=c++11 -x c++ %s + +/* WG14 N1285: No + * Extending the lifetime of temporary objects (factored approach) + * + * NB: we do not properly materialize temporary expressions in situations where + * it would be expected; that is why the "no-diagnostics" marking is named + * "wrong". We do issue the expected diagnostic in C++ mode. + */ + +// wrong-no-diagnostics + +struct X { int a[5]; }; +struct X f(void); + +int foo(void) { + // FIXME: This diagnostic should be issued in C11 as well (though not in C99, + // as this paper was a breaking change between C99 and C11). + int *p = f().a; // cpp-warning {{temporary whose address is used as value of local variable 'p' will be destroyed at the end of the full-expression}} + return *p; +} + diff --git a/clang/test/C/C2x/n2819.c b/clang/test/C/C2x/n2819.c new file mode 100644 index 00000000000000..c428fbba802453 --- /dev/null +++ b/clang/test/C/C2x/n2819.c @@ -0,0 +1,57 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 +// RUN: %clang_cc1 -triple=x86_64 -emit-llvm -o - -std=c23 %s | FileCheck %s + +/* WG14 N2819: No + * Disambiguate the storage class of some compound literals + */ + +int *escaped; +// CHECK-LABEL: define dso_local i32 @f( +// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr @escaped, align 8 +// CHECK-NEXT: ret i32 1 +// +int f(int *ptr) { escaped = ptr; return 1; } + +// CHECK-LABEL: define dso_local i32 @g( +// CHECK-SAME: ptr noundef [[PARA:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PARA_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PARA]], ptr [[PARA_ADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @f(ptr noundef @.compoundliteral) +// CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[CALL]] to i64 +// CHECK-NEXT: ret i32 0 +// +// FIXME: notice the we are using the global .compoundliteral object, not +// allocating a new object on each call to g(). That's what was clarified by +// N2819. +int g(char *para [f(( int [27]) { 0 })]) { + return 0; +} + +// CHECK-LABEL: define dso_local i32 @main( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @g(ptr noundef null) +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @escaped, align 8 +// CHECK-NEXT: store i32 12, ptr [[TMP0]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @escaped, align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// CHECK-NEXT: ret i32 [[TMP2]] +// +int main() { + // Sets 'escaped' to the address of the array created by the compound literal. + g(nullptr); + + // The lifetime of that object should have ended when g() returned, so this + // should be a use-after-free. 
+ *escaped = 12; + return *escaped; +} + diff --git a/clang/test/C/C2x/n3033.c b/clang/test/C/C2x/n3033.c new file mode 100644 index 00000000000000..bf249a37facb37 --- /dev/null +++ b/clang/test/C/C2x/n3033.c @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -std=c23 -E %s | FileCheck %s + +/* WG14 N3033: Clang 12 + * Comma ommission and deletion (__VA_OPT__) + */ + +#define F(...) f(0 __VA_OPT__(,) __VA_ARGS__) +#define G(X, ...) f(0, X __VA_OPT__(,) __VA_ARGS__) +#define SDEF(sname, ...) S sname __VA_OPT__(= { __VA_ARGS__ }) +#define EMP + +F(a, b, c) // replaced by f(0, a, b, c) +// CHECK: f(0 , a, b, c) +F() // replaced by f(0) +// CHECK: f(0 ) +F(EMP) // replaced by f(0) +// CHECK: f(0 ) + +G(a, b, c) // replaced by f(0, a, b, c) +// CHECK: f(0, a , b, c) +G(a, ) // replaced by f(0, a) +// CHECK: f(0, a ) +G(a) // replaced by f(0, a) +// CHECK: f(0, a ) + +SDEF(foo); // replaced by S foo; +// CHECK: S foo ; +SDEF(bar, 1, 2); // replaced by S bar = { 1, 2 }; +// CHECK: S bar = { 1, 2 }; + +//#define H1(X, ...) X __VA_OPT__(##) __VA_ARGS__ // error: ## may not appear at the beginning of a replacement list (6.10.3.3) + +#define H2(X, Y, ...) __VA_OPT__(X ## Y,) __VA_ARGS__ +H2(a, b, c, d) // replaced by ab, c, d +// CHECK: ab, c, d + +#define H3(X, ...) #__VA_OPT__(X##X X##X) +H3(, 0) // replaced by "" +// CHECK: "" + +#define H4(X, ...) __VA_OPT__(a X ## X) ## b +H4(, 1) // replaced by a b +// CHECK: a b + +#define H5A(...) __VA_OPT__()/**/__VA_OPT__() +#define H5B(X) a ## X ## b +#define H5C(X) H5B(X) +H5C(H5A()) // replaced by ab +// CHECK: ab + diff --git a/clang/test/C/C2x/n3033_2.c b/clang/test/C/C2x/n3033_2.c new file mode 100644 index 00000000000000..5da206658690cd --- /dev/null +++ b/clang/test/C/C2x/n3033_2.c @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -fsyntax-only -std=c23 -verify %s + +#define H1(X, ...) 
X __VA_OPT__(##) __VA_ARGS__ // expected-error {{'##' cannot appear at start of __VA_OPT__ argument}} + diff --git a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.stc/p1.cpp b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.stc/p1.cpp index f6b5d2487e73d0..9efa7b67f5bdbd 100644 --- a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.stc/p1.cpp +++ b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.stc/p1.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,spec %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wno-explicit-specialization-storage-class %s // A storage-class-specifier shall not be specified in an explicit // specialization (14.7.3) or an explicit instantiation (14.7.2) @@ -7,13 +8,13 @@ template void f(T) {} template static void g(T) {} -template<> static void f(int); // expected-warning{{explicit specialization cannot have a storage class}} +template<> static void f(int); // spec-warning{{explicit specialization cannot have a storage class}} template static void f(float); // expected-error{{explicit instantiation cannot have a storage class}} template<> void f(double); template void f(long); -template<> static void g(int); // expected-warning{{explicit specialization cannot have a storage class}} +template<> static void g(int); // spec-warning{{explicit specialization cannot have a storage class}} template static void g(float); // expected-error{{explicit instantiation cannot have a storage class}} template<> void g(double); @@ -29,5 +30,12 @@ int X::value = 17; template static int X::value; // expected-error{{explicit instantiation cannot have a storage class}} -template<> static int X::value; // expected-warning{{explicit specialization cannot have a storage class}} +template<> static int X::value; // spec-warning{{explicit specialization cannot have a storage class}} // expected-error@-1{{'static' can only be specified inside the class definition}} + +struct t1 { + template + static void f1(); + template<> + static void f1(); // spec-warning{{explicit specialization cannot have a storage class}} +}; diff --git a/clang/test/CodeGen/PowerPC/inline-asm-constraints-error.c b/clang/test/CodeGen/PowerPC/inline-asm-constraints-error.c new file mode 100644 index 00000000000000..2f35e52fc0b77c --- /dev/null +++ b/clang/test/CodeGen/PowerPC/inline-asm-constraints-error.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -emit-llvm-only -triple powerpc64-ibm-aix-xcoff -verify %s +// RUN: %clang_cc1 -emit-llvm-only -triple powerpc-ibm-aix-xcoff -verify %s +// This test case exist to test marking the 'a' inline assembly constraint as +// unsupported because powerpc previously marked it as supported. +int foo(int arg){ + asm goto ("bc 12,2,%l[TEST_LABEL]" : : "a"(&&TEST_LABEL) : : TEST_LABEL); //expected-error {{invalid input constraint 'a' in asm}} + return 0; +TEST_LABEL: return arg + 1; +} diff --git a/clang/test/CodeGen/aarch64-mixed-target-attributes.c b/clang/test/CodeGen/aarch64-mixed-target-attributes.c index 6aa747d4cb4617..3c047fec6ceedb 100644 --- a/clang/test/CodeGen/aarch64-mixed-target-attributes.c +++ b/clang/test/CodeGen/aarch64-mixed-target-attributes.c @@ -30,9 +30,6 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void //. 
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @explicit_default.ifunc = weak_odr alias i32 (), ptr @explicit_default -// CHECK: @implicit_default.ifunc = weak_odr alias i32 (), ptr @implicit_default -// CHECK: @default_def_with_version_decls.ifunc = weak_odr alias i32 (), ptr @default_def_with_version_decls // CHECK: @explicit_default = weak_odr ifunc i32 (), ptr @explicit_default.resolver // CHECK: @implicit_default = weak_odr ifunc i32 (), ptr @implicit_default.resolver // CHECK: @default_def_with_version_decls = weak_odr ifunc i32 (), ptr @default_def_with_version_decls.resolver diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c new file mode 100644 index 00000000000000..7c9067a5ceece3 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c @@ -0,0 +1,1825 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: aarch64-registered-target +//RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// +// X2- hor +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_s8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_s8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svreadz_hor_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_s8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_u8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_u8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svreadz_hor_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_u8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_s16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_s16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svreadz_hor_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_s16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_u16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_u16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], 
i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svreadz_hor_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_u16_vg2(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_f16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_f16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svreadz_hor_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_f16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_bf16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_hor_za16_bf16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svreadz_hor_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_bf16_vg2(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_s32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } 
[[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_s32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svreadz_hor_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_s32_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_u32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_u32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svreadz_hor_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_u32_vg2(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_f32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_f32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svreadz_hor_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_f32_vg2(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_s64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_s64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svreadz_hor_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_s64_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_u64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_u64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svreadz_hor_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_u64_vg2(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_f64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = 
extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_f64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svreadz_hor_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_f64_vg2(7, slice); +} + + +// +// X2- ver +// + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_s8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_s8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svreadz_ver_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_s8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_u8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_u8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svreadz_ver_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_u8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_s16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_s16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svreadz_ver_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_s16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_u16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_u16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svreadz_ver_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_u16_vg2(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_f16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// 
CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_f16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svreadz_ver_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_f16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_bf16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_ver_za16_bf16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svreadz_ver_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_bf16_vg2(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_s32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_s32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = 
extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svreadz_ver_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_s32_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_u32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_u32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svreadz_ver_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_u32_vg2(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_f32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_f32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svreadz_ver_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_f32_vg2(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_s64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_s64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svreadz_ver_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_s64_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_u64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_u64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svreadz_ver_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_u64_vg2(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_f64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_f64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svreadz_ver_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_f64_vg2(7, slice); +} + + +// +// X4 - hor +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_s8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_s8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svreadz_hor_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_s8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_u8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: 
[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_u8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svreadz_hor_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_u8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_s16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_s16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svreadz_hor_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_s16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_u16_x4( +// CHECK-SAME: i32 noundef 
[[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_u16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svreadz_hor_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_u16_vg4(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_f16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_f16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { 
, , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svreadz_hor_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_f16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_bf16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_hor_za16_bf16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svreadz_hor_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_bf16_vg4(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_s32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_s32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svreadz_hor_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_s32_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_u32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_u32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svreadz_hor_za32_u32_x4(uint32_t slice) 
__arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_u32_vg4(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_f32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_f32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svreadz_hor_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_f32_vg4(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_s64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_s64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , 
} [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svreadz_hor_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_s64_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_u64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_u64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svreadz_hor_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_u64_vg4(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_f64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_f64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svreadz_hor_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_f64_vg4(7, slice); +} + +// +// X4 - ver +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_s8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_s8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svreadz_ver_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_s8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_u8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_u8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svreadz_ver_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_u8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_s16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_s16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svreadz_ver_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_s16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_u16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_u16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svreadz_ver_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_u16_vg4(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_f16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_f16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svreadz_ver_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_f16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_bf16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_ver_za16_bf16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svreadz_ver_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_bf16_vg4(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_s32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_s32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svreadz_ver_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_s32_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_u32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// 
CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_u32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svreadz_ver_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_u32_vg4(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_f32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_f32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svreadz_ver_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_f32_vg4(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_s64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { 
, , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_s64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svreadz_ver_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_s64_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_u64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_u64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], 
[[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svreadz_ver_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_u64_vg4(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_f64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_f64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svreadz_ver_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_f64_vg4(7, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_s8( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z23test_svreadz_hor_za8_s8j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svreadz_hor_za8_s8(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_s8(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_u8( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z23test_svreadz_hor_za8_u8j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svreadz_hor_za8_u8(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_u8(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_s16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za16_s16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svreadz_hor_za16_s16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_s16(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_u16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za16_u16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svreadz_hor_za16_u16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_u16(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za16_f16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svreadz_hor_za16_f16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_f16(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za16_bf16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svreadz_hor_za16_bf16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_bf16(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_s32( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za32_s32j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svreadz_hor_za32_s32(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_s32(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_u32( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za32_u32j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svreadz_hor_za32_u32(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_u32(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_f32( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za32_f32j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svreadz_hor_za32_f32(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_f32(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_s64( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za64_s64j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svreadz_hor_za64_s64(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_s64(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_u64( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za64_u64j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svreadz_hor_za64_u64(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_u64(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_f64( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { 
+// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za64_f64j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svreadz_hor_za64_f64(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_f64(7, slice); +} + +// ZA128 +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_s8( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za128_s8j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svreadz_hor_za128_s8(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_s8(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_u8( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z25test_svreadz_hor_za128_u8j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svreadz_hor_za128_u8(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_u8(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_s16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_s16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svreadz_hor_za128_s16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_s16(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_u16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_u16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svreadz_hor_za128_u16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_u16(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_f16( +// CHECK-SAME: 
i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_f16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svreadz_hor_za128_f16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_f16(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 5, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z27test_svreadz_hor_za128_bf16j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 5, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svreadz_hor_za128_bf16(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_bf16(5, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_s32( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 6, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_s32j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 6, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svreadz_hor_za128_s32(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_s32(6, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_u32( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_u32j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svreadz_hor_za128_u32(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_u32(7, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_f32( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 8, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_f32j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 8, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svreadz_hor_za128_f32(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_f32(8, slice); +} + +// CHECK-LABEL: 
define dso_local @test_svreadz_hor_za128_s64( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 13, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_s64j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 13, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svreadz_hor_za128_s64(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_s64(13, slice); +} +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_u64( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 14, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_u64j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 14, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svreadz_hor_za128_u64(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_u64(14, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za128_f64( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 15, i32 [[SLICE]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za128_f64j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 15, i32 [[SLICE]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svreadz_hor_za128_f64(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za128_f64(15, slice); +} diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c index 7aabecd1c07b85..e4e0ae7b66afc8 100644 --- a/clang/test/CodeGen/aarch64-targetattr.c +++ b/clang/test/CodeGen/aarch64-targetattr.c @@ -196,14 +196,14 @@ void minusarch() {} // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } -// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } +// CHECK: 
attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } // CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="cortex-a710" } -// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a" } +// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+ete,+fp-armv8,+neon,+trbe,+v8a" } // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" } // CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+perfmon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.1a,+v8.2a,+v8a" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR9]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" "tune-cpu"="cortex-a710" } -// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" } -// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,-sve" } +// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" } +// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,-sve" } // CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } // CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16" } // CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" 
"target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } diff --git a/clang/test/CodeGen/asan-frame-pointer.cpp b/clang/test/CodeGen/asan-frame-pointer.cpp index ed3624f3146ebb..ecc1a18c11daa3 100644 --- a/clang/test/CodeGen/asan-frame-pointer.cpp +++ b/clang/test/CodeGen/asan-frame-pointer.cpp @@ -8,12 +8,12 @@ int global; // NONE: define internal void @asan.module_ctor() #[[#ATTR:]] { // NONE: define internal void @asan.module_dtor() #[[#ATTR]] { -// NONE: attributes #[[#ATTR]] = { nounwind } +// NONE: attributes #[[#ATTR]] = { nounwind // NONLEAF: define internal void @asan.module_ctor() #[[#ATTR:]] { // NONLEAF: define internal void @asan.module_dtor() #[[#ATTR]] { -// NONLEAF: attributes #[[#ATTR]] = { nounwind "frame-pointer"="non-leaf" } +// NONLEAF: attributes #[[#ATTR]] = { nounwind "frame-pointer"="non-leaf" // ALL: define internal void @asan.module_ctor() #[[#ATTR:]] { // ALL: define internal void @asan.module_dtor() #[[#ATTR]] { -// ALL: attributes #[[#ATTR]] = { nounwind "frame-pointer"="all" } +// ALL: attributes #[[#ATTR]] = { nounwind "frame-pointer"="all" diff --git a/clang/test/CodeGen/asan-globals.cpp b/clang/test/CodeGen/asan-globals.cpp index 4a370cbd446500..be52ea99de9692 100644 --- a/clang/test/CodeGen/asan-globals.cpp +++ b/clang/test/CodeGen/asan-globals.cpp @@ -67,13 +67,13 @@ void func() { // CHECK-NEXT: call void @__asan_unregister_globals // CHECK-NEXT: ret void -// CHECK: attributes #[[#ATTR]] = { nounwind } +// CHECK: attributes #[[#ATTR]] = { nounwind /// If -fasynchronous-unwind-tables, set the module flag "uwtable". ctor/dtor /// will thus get the uwtable attribute. // RUN: %clang_cc1 -emit-llvm -fsanitize=address -funwind-tables=2 -o - %s | FileCheck %s --check-prefixes=UWTABLE // UWTABLE: define internal void @asan.module_dtor() #[[#ATTR:]] { -// UWTABLE: attributes #[[#ATTR]] = { nounwind uwtable } +// UWTABLE: attributes #[[#ATTR]] = { nounwind uwtable // UWTABLE: ![[#]] = !{i32 7, !"uwtable", i32 2} // IGNORELIST-SRC: @{{.*}}extra_global{{.*}} ={{.*}} global diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index ad6079a91fcd59..60f9c7f1fc24ed 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -27,14 +27,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @ftc.ifunc = weak_odr alias i32 (), ptr @ftc -// CHECK: @ftc_def.ifunc = weak_odr alias i32 (), ptr @ftc_def -// CHECK: @ftc_dup1.ifunc = weak_odr alias i32 (), ptr @ftc_dup1 -// CHECK: @ftc_dup2.ifunc = weak_odr alias i32 (), ptr @ftc_dup2 -// CHECK: @ftc_dup3.ifunc = weak_odr alias i32 (), ptr @ftc_dup3 -// CHECK: @ftc_inline2.ifunc = weak_odr alias i32 (), ptr @ftc_inline2 -// CHECK: @ftc_inline1.ifunc = weak_odr alias i32 (), ptr @ftc_inline1 -// CHECK: @ftc_inline3.ifunc = weak_odr alias i32 (), ptr @ftc_inline3 // CHECK: @ftc = weak_odr ifunc i32 (), ptr @ftc.resolver // CHECK: @ftc_def = weak_odr ifunc i32 (), ptr @ftc_def.resolver // CHECK: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver @@ -45,14 +37,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver //. 
 // CHECK-MTE-BTI: @__aarch64_cpu_features = external dso_local global { i64 }
-// CHECK-MTE-BTI: @ftc.ifunc = weak_odr alias i32 (), ptr @ftc
-// CHECK-MTE-BTI: @ftc_def.ifunc = weak_odr alias i32 (), ptr @ftc_def
-// CHECK-MTE-BTI: @ftc_dup1.ifunc = weak_odr alias i32 (), ptr @ftc_dup1
-// CHECK-MTE-BTI: @ftc_dup2.ifunc = weak_odr alias i32 (), ptr @ftc_dup2
-// CHECK-MTE-BTI: @ftc_dup3.ifunc = weak_odr alias i32 (), ptr @ftc_dup3
-// CHECK-MTE-BTI: @ftc_inline2.ifunc = weak_odr alias i32 (), ptr @ftc_inline2
-// CHECK-MTE-BTI: @ftc_inline1.ifunc = weak_odr alias i32 (), ptr @ftc_inline1
-// CHECK-MTE-BTI: @ftc_inline3.ifunc = weak_odr alias i32 (), ptr @ftc_inline3
 // CHECK-MTE-BTI: @ftc = weak_odr ifunc i32 (), ptr @ftc.resolver
 // CHECK-MTE-BTI: @ftc_def = weak_odr ifunc i32 (), ptr @ftc_def.resolver
 // CHECK-MTE-BTI: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 75f8734e5aaf37..024aafffca6299 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -137,19 +137,6 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //.
 // CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
-// CHECK: @fmv.ifunc = weak_odr alias i32 (), ptr @fmv
-// CHECK: @fmv_one.ifunc = weak_odr alias i32 (), ptr @fmv_one
-// CHECK: @fmv_two.ifunc = weak_odr alias i32 (), ptr @fmv_two
-// CHECK: @fmv_e.ifunc = weak_odr alias i32 (), ptr @fmv_e
-// CHECK: @fmv_d.ifunc = internal alias i32 (), ptr @fmv_d
-// CHECK: @fmv_c.ifunc = weak_odr alias void (), ptr @fmv_c
-// CHECK: @fmv_inline.ifunc = weak_odr alias i32 (), ptr @fmv_inline
-// CHECK: @unused_with_default_def.ifunc = weak_odr alias i32 (), ptr @unused_with_default_def
-// CHECK: @unused_with_implicit_default_def.ifunc = weak_odr alias i32 (), ptr @unused_with_implicit_default_def
-// CHECK: @unused_with_implicit_forward_default_def.ifunc = weak_odr alias i32 (), ptr @unused_with_implicit_forward_default_def
-// CHECK: @default_def_with_version_decls.ifunc = weak_odr alias i32 (), ptr @default_def_with_version_decls
-// CHECK: @used_def_without_default_decl.ifunc = weak_odr alias i32 (), ptr @used_def_without_default_decl
-// CHECK: @used_decl_without_default_decl.ifunc = weak_odr alias i32 (), ptr @used_decl_without_default_decl
 // CHECK: @fmv = weak_odr ifunc i32 (), ptr @fmv.resolver
 // CHECK: @fmv_one = weak_odr ifunc i32 (), ptr @fmv_one.resolver
 // CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver
diff --git a/clang/test/CodeGen/coverage-target-attr.c b/clang/test/CodeGen/coverage-target-attr.c
new file mode 100644
index 00000000000000..d46299f5bee223
--- /dev/null
+++ b/clang/test/CodeGen/coverage-target-attr.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -emit-llvm -coverage-notes-file=/dev/null -coverage-data-file=/dev/null -triple aarch64-linux-android30 -target-cpu generic -target-feature +tagged-globals -fsanitize=hwaddress %s -o %t
+// RUN: FileCheck %s < %t
+
+// CHECK: define internal void @__llvm_gcov_writeout() unnamed_addr [[ATTR:#[0-9]+]]
+// CHECK: define internal void @__llvm_gcov_reset() unnamed_addr [[ATTR]]
+// CHECK: define internal void @__llvm_gcov_init() unnamed_addr [[ATTR]]
+// CHECK: define internal void @hwasan.module_ctor() [[ATTR2:#[0-9]+]]
+// CHECK: attributes [[ATTR]] = {{.*}} "target-cpu"="generic" "target-features"="+tagged-globals"
+// CHECK: attributes [[ATTR2]] = {{.*}} "target-cpu"="generic" "target-features"="+tagged-globals"
+
+__attribute__((weak)) int foo = 0;
+
+__attribute__((weak)) void bar() {}
+
+int main() {
+  if (foo) bar();
+}
diff --git a/clang/test/CodeGen/ptrauth-intrinsics.c b/clang/test/CodeGen/ptrauth-intrinsics.c
index db37d785537691..50bf1898e4b372 100644
--- a/clang/test/CodeGen/ptrauth-intrinsics.c
+++ b/clang/test/CodeGen/ptrauth-intrinsics.c
@@ -1,5 +1,8 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-intrinsics -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-elf -fptrauth-intrinsics -emit-llvm %s -o - | FileCheck %s
+//
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-intrinsics -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-elf -fptrauth-intrinsics -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s
 
 void (*fnptr)(void);
 long int_discriminator;
diff --git a/clang/test/CodeGen/ptrauth-ubsan-vptr.cpp b/clang/test/CodeGen/ptrauth-ubsan-vptr.cpp
new file mode 100644
index 00000000000000..6c36004641477d
--- /dev/null
+++ b/clang/test/CodeGen/ptrauth-ubsan-vptr.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple arm64e-apple-ios15 -fsanitize=vptr -O0 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64e-apple-ios15 -fsanitize=vptr -O2 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+
+struct S {
+  S() {}
+  ~S() {}
+  virtual int v() { return 0; }
+  int a;
+};
+
+struct T : public S {
+  virtual int v();
+};
+
+// CHECK-LABEL: foo1
+int foo1(void* Buffer) {
+  T *p = reinterpret_cast<T *>(Buffer);
+  return p->v();
+}
+// CHECK-NOT: call {{.*}} @llvm.ptrauth.auth{{.*}}
+// CHECK-NOT: call {{.*}} @llvm.ptrauth.strip{{.*}}
+
+// CHECK-LABEL: foo2
+int foo2(S* s) {
+  T *p = dynamic_cast<T *>(s);
+  return p->v();
+}
+
+// CHECK-NOT: call {{.*}} @llvm.ptrauth.auth{{.*}}
+// CHECK-NOT: call {{.*}} @llvm.ptrauth.strip{{.*}}
diff --git a/clang/test/CodeGen/sanitize-metadata-nosanitize.c b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
index 60f93476b050ff..6414956fb67967 100644
--- a/clang/test/CodeGen/sanitize-metadata-nosanitize.c
+++ b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
@@ -93,7 +93,7 @@ __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) {
 // CHECK: attributes #1 = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 // CHECK: attributes #2 = { disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 // CHECK: attributes #3 = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #4 = { nounwind }
+// CHECK: attributes #4 = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 //.
// CHECK: !2 = !{!"sanmd_covered!C", !3} // CHECK: !3 = !{i64 0} diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 7f7005d21b99a3..9a9fda70226fc2 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -269,6 +269,6 @@ // RUN: FileCheck %s -check-prefix=VE // VE: target datalayout = "e-m:e-i64:64-n32:64-S128-v64:64:64-v128:64:64-v256:64:64-v512:64:64-v1024:64:64-v2048:64:64-v4096:64:64-v8192:64:64-v16384:64:64" -// RUN: %clang_cc1 -triple spirv64-amd -o - -emit-llvm %s | \ -// RUN: FileCheck %s -check-prefix=SPIR64 +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -o - -emit-llvm %s | \ +// RUN: FileCheck %s -check-prefix=AMDGPUSPIRV64 // AMDGPUSPIRV64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0" diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 57557bf09f411c..eeb23bc7e1c01e 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -49,18 +49,18 @@ __global__ void ffp2(double *p) { // CHECK: atomicrmw fmin ptr {{.*}} monotonic // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic - // SAFE: _Z4ffp2Pd + // SAFE-LABEL: @_Z4ffp2Pd // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 - // UNSAFE: _Z4ffp2Pd - // UNSAFE: global_atomic_cmpswap_x2 - // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE-LABEL: @_Z4ffp2Pd // UNSAFE: global_atomic_cmpswap_x2 // UNSAFE: global_atomic_cmpswap_x2 // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_max_f64 + // UNSAFE: global_atomic_min_f64 __atomic_fetch_sub(p, 1.0, memory_order_relaxed); __atomic_fetch_max(p, 1.0, memory_order_relaxed); __atomic_fetch_min(p, 1.0, memory_order_relaxed); @@ -76,18 +76,18 @@ __global__ void ffp3(long double *p) { // CHECK: atomicrmw fmin ptr {{.*}} monotonic // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic - // SAFE: _Z4ffp3Pe + // SAFE-LABEL: @_Z4ffp3Pe // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 // SAFE: global_atomic_cmpswap_b64 - // UNSAFE: _Z4ffp3Pe - // UNSAFE: global_atomic_cmpswap_x2 - // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE-LABEL: @_Z4ffp3Pe // UNSAFE: global_atomic_cmpswap_x2 // UNSAFE: global_atomic_cmpswap_x2 // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_max_f64 + // UNSAFE: global_atomic_min_f64 __atomic_fetch_sub(p, 1.0L, memory_order_relaxed); __atomic_fetch_max(p, 1.0L, memory_order_relaxed); __atomic_fetch_min(p, 1.0L, memory_order_relaxed); diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu index 132cbd27b08fc3..2e88afac813f43 100644 --- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -98,7 +98,7 @@ __global__ // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) 
@_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false) +// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4 // CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4 // CHECK-NEXT: ret void // @@ -142,7 +142,7 @@ __global__ void test_ds_fadd(float src) { // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3) // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4 // CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4 // CHECK-NEXT: ret void // @@ -245,10 +245,10 @@ __device__ void func(float *x); // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3) // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4 // CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR8:[0-9]+]] +// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: ret void // __global__ void test_ds_fmin_func(float src, float *__restrict shared) { diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu index 7bb756a4a27314..32851805298f18 100644 --- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu +++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu @@ -95,7 +95,7 @@ __global__ // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false) +// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4 // CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4 // CHECK-NEXT: ret void // @@ -139,7 +139,7 @@ __global__ void test_ds_fadd(float src) { // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3) // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4 // CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4 // CHECK-NEXT: ret void // @@ -236,10 
+236,10 @@ __device__ void func(float *x); // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3) // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4 // CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR7:[0-9]+]] +// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: ret void // __global__ void test_ds_fmin_func(float src, float *__restrict shared) { diff --git a/clang/test/CodeGenCXX/attr-annotate.cpp b/clang/test/CodeGenCXX/attr-annotate.cpp index 64627a6b83e11c..ce284c7d199828 100644 --- a/clang/test/CodeGenCXX/attr-annotate.cpp +++ b/clang/test/CodeGenCXX/attr-annotate.cpp @@ -3,9 +3,9 @@ //CHECK: @[[STR1:.*]] = private unnamed_addr constant [{{.*}} x i8] c"{{.*}}attr-annotate.cpp\00", section "llvm.metadata" //CHECK: @[[STR2:.*]] = private unnamed_addr constant [4 x i8] c"abc\00", align 1 //CHECK: @[[STR:.*]] = private unnamed_addr constant [5 x i8] c"test\00", section "llvm.metadata" -//CHECK: @[[ARGS:.*]] = private unnamed_addr constant { %struct.Struct } { %struct.Struct { ptr @_ZN1AIjLj9EE2SVE, ptr getelementptr (i8, ptr @_ZN1AIjLj9EE2SVE, i64 4) } }, section "llvm.metadata" -//CHECK: @[[ARGS2:.*]] = private unnamed_addr constant { i32, ptr, i32 } { i32 9, ptr @[[STR2:.*]], i32 8 }, section "llvm.metadata" -//CHECK: @llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @_ZN1AIjLj9EE5test2Ev, ptr @.str.6, ptr @.str.1, i32 24, ptr @[[ARGS]] }, { ptr, ptr, ptr, i32, ptr } { ptr @_ZN1AIjLj9EE4testILi8EEEvv, ptr @[[STR:.*]], ptr @[[STR1:.*]], i32 {{.*}}, ptr @[[ARGS2:.*]] }] +//CHECK: @[[ARGS:.*]] = private unnamed_addr constant { i32, ptr, i32 } { i32 9, ptr @[[STR2:.*]], i32 8 }, section "llvm.metadata" +//CHECK: @[[ARGS2:.*]] = private unnamed_addr constant { %struct.Struct } { %struct.Struct { ptr @_ZN1AIjLj9EE2SVE, ptr getelementptr (i8, ptr @_ZN1AIjLj9EE2SVE, i64 4) } }, section "llvm.metadata" +//CHECK: @llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @_ZN1AIjLj9EE4testILi8EEEvv, ptr @.str.6, ptr @.str.1, i32 23, ptr @.args.7 }, { ptr, ptr, ptr, i32, ptr } { ptr @_ZN1AIjLj9EE5test2Ev, ptr @.str.6, ptr @.str.1, i32 24, ptr @.args.8 }], section "llvm.metadata" constexpr const char* str() { return "abc"; diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp index 7953f902bf09b2..29ae6b6856500f 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp @@ -40,10 +40,6 @@ void run_foo_tml() { //. 
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_Z7foo_ovli.ifunc = weak_odr alias i32 (i32), ptr @_Z7foo_ovli -// CHECK: @_Z7foo_ovlv.ifunc = weak_odr alias i32 (), ptr @_Z7foo_ovlv -// CHECK: @_ZN7MyClassIssE7foo_tmlEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv -// CHECK: @_ZN7MyClassIisE7foo_tmlEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv // CHECK: @_Z7foo_ovli = weak_odr ifunc i32 (i32), ptr @_Z7foo_ovli.resolver // CHECK: @_Z7foo_ovlv = weak_odr ifunc i32 (), ptr @_Z7foo_ovlv.resolver // CHECK: @_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp index 8b7273fe3bb517..fd19f4c5a30304 100644 --- a/clang/test/CodeGenCXX/attr-target-version.cpp +++ b/clang/test/CodeGenCXX/attr-target-version.cpp @@ -62,12 +62,6 @@ int bar() { //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_Z3fooi.ifunc = weak_odr alias i32 (i32), ptr @_Z3fooi -// CHECK: @_Z3foov.ifunc = weak_odr alias i32 (), ptr @_Z3foov -// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr alias i32 (ptr, i32), ptr @_ZN7MyClass3gooEi -// CHECK: @_ZN7MyClass23unused_with_default_defEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN7MyClass23unused_with_default_defEv -// CHECK: @_ZN7MyClass32unused_with_implicit_default_defEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN7MyClass32unused_with_implicit_default_defEv -// CHECK: @_ZN7MyClass40unused_with_implicit_forward_default_defEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv // CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver // CHECK: @_Z3fooi = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver // CHECK: @_Z3foov = weak_odr ifunc i32 (), ptr @_Z3foov.resolver diff --git a/clang/test/CodeGenCXX/auto-var-init-max-size.cpp b/clang/test/CodeGenCXX/auto-var-init-max-size.cpp index ef38b8227a9a1a..25c30c383332b5 100644 --- a/clang/test/CodeGenCXX/auto-var-init-max-size.cpp +++ b/clang/test/CodeGenCXX/auto-var-init-max-size.cpp @@ -15,7 +15,9 @@ struct Foo { int foo(unsigned n) { bool var_size_1; - long var_size_8 = 123; + long var_size_8_init = 123; + long var_size_8_noinit; + long var_size_8_init_later; void *var_size_8p; int var_size_1024[256]; Foo var_size_1028; @@ -24,29 +26,39 @@ int foo(unsigned n) { int var_vla[n]; // builtin, non-constant size var_size_8p = __builtin_alloca(sizeof(unsigned long long) * n); - // There are 8 variables: var_size_1, var_size_8, var_size_8p, var_size_1024, + var_size_8_init_later = 456; + // There are 10 variables: var_size_1, var_size_8_init, var_size_8_noinit, + // var_size_8_init_later, var_size_8p, var_size_1024, // var_size_1028, var_size_4096, var_vla, and a builtin anonymous var ("%5"). 
+ // - Doesn't need auto-init: var_size_8_init // - COMMON (auto-init regardless of the max size): "var_vla", and "%5" // - Max size 1: "var_size_1" - // - Max size 1024: "var_size_1", "var_size_8", "var_size_8p", "var_size_1024" - // - Max size 4096: "var_size_1", "var_size_8", "var_size_8p", "var_size_1024", "var_size_1028", "var_size_4096" + // - Max size 1024: "var_size_1", "var_size_8_noinit", "var_size_8_init_later", "var_size_8p", "var_size_1024" + // - Max size 4096: "var_size_1", "var_size_8_noinit", "var_size_8_init_later", "var_size_8p", "var_size_1024", + // "var_size_1028", "var_size_4096" // // PATTERN-MAX-1: store i8 -86, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] - // PATTERN-MAX-1-NOT: store i64 -6148914691236517206, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1-NEXT: store i64 123, ptr %var_size_8_init, align 8 + // PATTERN-MAX-1-NOT: store i64 -6148914691236517206, ptr %var_size_8_noinit, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1-NOT: store i64 -6148914691236517206, ptr %var_size_8_init_later, align 8, !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1-NOT: store ptr inttoptr (i64 -6148914691236517206 to ptr), ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 -86, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 -86, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 -86, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1024: store i8 -86, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] - // PATTERN-MAX-1024: store i64 -6148914691236517206, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1024-NEXT: store i64 123, ptr %var_size_8_init, align 8 + // PATTERN-MAX-1024: store i64 -6148914691236517206, ptr %var_size_8_noinit, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1024: store i64 -6148914691236517206, ptr %var_size_8_init_later, align 8, !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1024: store ptr inttoptr (i64 -6148914691236517206 to ptr), ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1024: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 -86, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 -86, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 -86, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-4096: store i8 -86, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] - // PATTERN-MAX-4096: store i64 -6148914691236517206, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-4096-NEXT: store i64 123, ptr %var_size_8_init, align 8 + // PATTERN-MAX-4096: store i64 -6148914691236517206, ptr %var_size_8_noinit, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-4096: store i64 -6148914691236517206, ptr %var_size_8_init_later, align 8, !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-4096: store ptr inttoptr (i64 -6148914691236517206 to ptr), ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-4096: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 -86, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] // PATTERN-MAX-4096: 
call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 -86, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] @@ -56,21 +68,27 @@ int foo(unsigned n) { // PATTERN-COMMON: call void @llvm.memset.p0.i64(ptr align 16 %5, i8 -86, i64 %mul, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1: store i8 0, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] - // ZERO-MAX-1-NOT: store i64 0, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1-NEXT: store i64 123, ptr %var_size_8_init, align 8 + // ZERO-MAX-1-NOT: store i64 0, ptr %var_size_8_noinit, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1-NOT: store i64 0, ptr %var_size_8_init_later, align 8, !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1-NOT: store ptr null, ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 0, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 0, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 0, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1024: store i8 0, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] - // ZERO-MAX-1024: store i64 0, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1024-NEXT: store i64 123, ptr %var_size_8_init, align 8 + // ZERO-MAX-1024: store i64 0, ptr %var_size_8_noinit, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1024: store i64 0, ptr %var_size_8_init_later, align 8, !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1024: store ptr null, ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1024: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 0, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 0, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 0, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-4096: store i8 0, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] - // ZERO-MAX-4096: store i64 0, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-4096-NEXT: store i64 123, ptr %var_size_8_init, align 8 + // ZERO-MAX-4096: store i64 0, ptr %var_size_8_noinit, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-4096: store i64 0, ptr %var_size_8_init_later, align 8, !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-4096: store ptr null, ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-4096: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 0, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] // ZERO-MAX-4096: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 0, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] diff --git a/clang/test/CodeGenCXX/auto-var-init-stop-after.cpp b/clang/test/CodeGenCXX/auto-var-init-stop-after.cpp index a782692d0127ec..f1dc0e3a068e76 100644 --- a/clang/test/CodeGenCXX/auto-var-init-stop-after.cpp +++ b/clang/test/CodeGenCXX/auto-var-init-stop-after.cpp @@ -18,7 +18,7 @@ typedef struct { int foo(unsigned n) { // scalar variable - long a = 888; + long a; // array S arr[ARRLEN]; // VLA diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp index e1568bee136e5e..e697731b0cdf1e 100644 --- a/clang/test/CodeGenCXX/auto-var-init.cpp +++ 
b/clang/test/CodeGenCXX/auto-var-init.cpp @@ -146,16 +146,8 @@ struct notlockfree { long long a[4]; }; // PATTERN-O1-NOT: @__const.test_atomictailpad_uninit.uninit // PATTERN-O0: @__const.test_complexfloat_uninit.uninit = private unnamed_addr constant { float, float } { float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000 }, align 4 // PATTERN-O1-NOT: @__const.test_complexfloat_uninit.uninit -// PATTERN-O0: @__const.test_complexfloat_braces.braces = private unnamed_addr constant { float, float } { float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000 }, align 4 -// PATTERN-O1-NOT: @__const.test_complexfloat_braces.braces -// PATTERN-O0: @__const.test_complexfloat_custom.custom = private unnamed_addr constant { float, float } { float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000 }, align 4 -// PATTERN-O1-NOT: @__const.test_complexfloat_custom.custom // PATTERN-O0: @__const.test_complexdouble_uninit.uninit = private unnamed_addr constant { double, double } { double 0xFFFFFFFFFFFFFFFF, double 0xFFFFFFFFFFFFFFFF }, align 8 // PATTERN-O1-NOT: @__const.test_complexdouble_uninit.uninit -// PATTERN-O0: @__const.test_complexdouble_braces.braces = private unnamed_addr constant { double, double } { double 0xFFFFFFFFFFFFFFFF, double 0xFFFFFFFFFFFFFFFF }, align 8 -// PATTERN-O1-NOT: @__const.test_complexdouble_braces.braces -// PATTERN-O0: @__const.test_complexdouble_custom.custom = private unnamed_addr constant { double, double } { double 0xFFFFFFFFFFFFFFFF, double 0xFFFFFFFFFFFFFFFF }, align 8 -// PATTERN-O1-NOT: @__const.test_complexdouble_custom.custom // PATTERN-O0: @__const.test_semivolatile_uninit.uninit = private unnamed_addr constant %struct.semivolatile { i32 [[I32]], i32 [[I32]] }, align 4 // PATTERN-O0: @__const.test_semivolatile_custom.custom = private unnamed_addr constant %struct.semivolatile { i32 1145324612, i32 1145324612 }, align 4 // PATTERN-O1-NOT: @__const.test_semivolatile_custom.custom diff --git a/clang/test/CodeGenCXX/catch-undef-behavior.cpp b/clang/test/CodeGenCXX/catch-undef-behavior.cpp index 1a0e98a6613754..0c14c97dacca95 100644 --- a/clang/test/CodeGenCXX/catch-undef-behavior.cpp +++ b/clang/test/CodeGenCXX/catch-undef-behavior.cpp @@ -56,8 +56,8 @@ void member_access(S *p) { // (1b) Check that 'p' actually points to an 'S'. - // CHECK: %[[VPTR:.*]] = load i64, ptr - // + // CHECK: %[[VTABLE:.*]] = load ptr, ptr %0 + // CHECK: %[[VPTR:.*]] = ptrtoint ptr %[[VTABLE]] to i64 // hash_16_bytes: // // If this number changes, it indicates that either the mangled name of ::S @@ -110,7 +110,8 @@ void member_access(S *p) { // (3b) Check that 'p' actually points to an 'S' - // CHECK: load i64, ptr + // CHECK: [[VTABLE2:%.*]] = load ptr, ptr + // CHECK: ptrtoint ptr [[VTABLE2]] to i64 // CHECK-NEXT: mul i64 %[[#]], -4658895280553007687, !nosanitize // [...] 
// CHECK: getelementptr inbounds [128 x i64], ptr @__ubsan_vptr_type_cache, i32 0, i64 % diff --git a/clang/test/CodeGenCXX/debug-info-verbose-trap.cpp b/clang/test/CodeGenCXX/debug-info-verbose-trap.cpp new file mode 100644 index 00000000000000..f492698ccab83d --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-verbose-trap.cpp @@ -0,0 +1,54 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++20 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s + +// CHECK-LABEL: define void @_Z2f0v() +// CHECK: call void @llvm.trap(), !dbg ![[LOC17:.*]] + +// CHECK: declare void @llvm.trap() #[[ATTR1:.*]] + +// CHECK-LABEL: define void @_Z2f1v() +// CHECK: call void @llvm.trap(), !dbg ![[LOC23:.*]] +// CHECK: call void @llvm.trap(), !dbg ![[LOC25:.*]] + +// CHECK-LABEL: define void @_Z2f3v() +// CHECK: call void @_Z2f2IXadsoKcL_ZL8constCatEEEXadsoS0_L_ZL8constMsgEEEEvv() + +// CHECK-LABEL: define internal void @_Z2f2IXadsoKcL_ZL8constCatEEEXadsoS0_L_ZL8constMsgEEEEvv +// CHECK: call void @llvm.trap(), !dbg ![[LOC36:.*]] + +// CHECK: attributes #[[ATTR1]] = { cold {{.*}}} + +// CHECK: ![[FILESCOPE:.*]] = !DIFile(filename: "{{.*}}debug-info-verbose-trap.cpp" + +char const constCat[] = "category2"; +char const constMsg[] = "hello"; + +// CHECK: ![[SUBPROG14:.*]] = distinct !DISubprogram(name: "f0", linkageName: "_Z2f0v", +// CHECK: ![[LOC17]] = !DILocation(line: 0, scope: ![[SUBPROG18:.*]], inlinedAt: ![[LOC20:.*]]) +// CHECK: ![[SUBPROG18]] = distinct !DISubprogram(name: "__clang_trap_msg$category1$Argument_must_not_be_null", scope: ![[FILESCOPE]], file: ![[FILESCOPE]], type: !{{.*}}, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !{{.*}}) +// CHECK: ![[LOC20]] = !DILocation(line: [[@LINE+2]], column: 3, scope: ![[SUBPROG14]]) +void f0() { + __builtin_verbose_trap("category1", "Argument_must_not_be_null"); +} + +// CHECK: ![[SUBPROG22:.*]] = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", +// CHECK: ![[LOC23]] = !DILocation(line: 0, scope: ![[SUBPROG18]], inlinedAt: ![[LOC24:.*]]) +// CHECK: ![[LOC24]] = !DILocation(line: [[@LINE+5]], column: 3, scope: ![[SUBPROG22]]) +// CHECK: ![[LOC25]] = !DILocation(line: 0, scope: ![[SUBPROG26:.*]], inlinedAt: ![[LOC27:.*]]) +// CHECK: ![[SUBPROG26]] = distinct !DISubprogram(name: "__clang_trap_msg$category2$hello", scope: ![[FILESCOPE]], file: ![[FILESCOPE]], type: !{{.*}}, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !{{.*}}) +// CHECK: ![[LOC27]] = !DILocation(line: [[@LINE+3]], column: 3, scope: ![[SUBPROG22]]) +void f1() { + __builtin_verbose_trap("category1", "Argument_must_not_be_null"); + __builtin_verbose_trap("category2", "hello"); +} + +// CHECK: ![[SUBPROG32:.*]] = distinct !DISubprogram(name: "f2", linkageName: "_Z2f2IXadsoKcL_ZL8constCatEEEXadsoS0_L_ZL8constMsgEEEEvv", +// CHECK: ![[LOC36]] = !DILocation(line: 0, scope: ![[SUBPROG26]], inlinedAt: ![[LOC37:.*]]) +// CHECK: ![[LOC37]] = !DILocation(line: [[@LINE+3]], column: 3, scope: ![[SUBPROG32]]) +template +void f2() { + __builtin_verbose_trap(category, reason); +} + +void f3() { + f2(); +} diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp index 5bcd0da06eebc3..193f01db4c5d3a 100644 --- a/clang/test/CodeGenCXX/fmv-namespace.cpp +++ b/clang/test/CodeGenCXX/fmv-namespace.cpp @@ -19,8 +19,6 @@ int baz() { return OtherName::foo(); } //. 
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_ZN4Name3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN4Name3fooEv -// CHECK: @_ZN9OtherName3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN9OtherName3fooEv // CHECK: @_ZN4Name3fooEv = weak_odr ifunc i32 (), ptr @_ZN4Name3fooEv.resolver // CHECK: @_ZN9OtherName3fooEv = weak_odr ifunc i32 (), ptr @_ZN9OtherName3fooEv.resolver //. diff --git a/clang/test/CodeGenCXX/microsoft-uuidof.cpp b/clang/test/CodeGenCXX/microsoft-uuidof.cpp index 35b4f027e54d1d..2de964a152dd56 100644 --- a/clang/test/CodeGenCXX/microsoft-uuidof.cpp +++ b/clang/test/CodeGenCXX/microsoft-uuidof.cpp @@ -4,6 +4,13 @@ // RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-64 // RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -DWRONG_GUID -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-WRONG-GUID +/// The same, but with the new constant interpreter. +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -emit-llvm %s -o - -DDEFINE_GUID -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-GUID +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -emit-llvm %s -o - -DDEFINE_GUID -DBRACKET_ATTRIB -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-GUID +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -emit-llvm %s -o - -triple=i386-pc-linux -fms-extensions | FileCheck %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -emit-llvm %s -o - -triple=x86_64-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-64 +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -emit-llvm %s -o - -DDEFINE_GUID -DWRONG_GUID -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-WRONG-GUID + #ifdef DEFINE_GUID struct _GUID { #ifdef WRONG_GUID diff --git a/clang/test/CodeGenCXX/msabi-preserve-none-cc.cpp b/clang/test/CodeGenCXX/msabi-preserve-none-cc.cpp new file mode 100644 index 00000000000000..29df5e4d84a700 --- /dev/null +++ b/clang/test/CodeGenCXX/msabi-preserve-none-cc.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fdeclspec -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-unknown-windows-msvc -fdeclspec -emit-llvm %s -o - | FileCheck %s + +void __attribute__((__preserve_none__)) f() {} +// CHECK-DAG: @"?f@@YVXXZ" + +void (__attribute__((__preserve_none__)) *p)(); +// CHECK-DAG: @"?p@@3P6VXXZEA + +namespace { +void __attribute__((__preserve_none__)) __attribute__((__used__)) f() { } +} +// CHECK-DAG: @"?f@?A0x{{[^@]*}}@@YVXXZ" + +namespace n { +void __attribute__((__preserve_none__)) f() {} +} +// CHECK-DAG: @"?f@n@@YVXXZ" + +struct __declspec(dllexport) S { + S(const S &) = delete; + S & operator=(const S &) = delete; + void __attribute__((__preserve_none__)) m() { } +}; +// CHECK-DAG: @"?m@S@@QEAVXXZ" + +void f(void (__attribute__((__preserve_none__))())) {} +// CHECK-DAG: @"?f@@YAXP6VXXZ@Z" diff --git a/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-call-2.cpp b/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-call-2.cpp new file mode 100644 index 00000000000000..c2f20d56b0a6b0 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-call-2.cpp @@ -0,0 +1,105 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fapple-kext -fno-rtti -emit-llvm -o - %s | FileCheck %s + +// CHECK: @_ZTV1A = unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr 
ptrauth (ptr @_ZNK1A3abcEv, i32 0, i64 12401, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2)), ptr null] }, align 8 +// CHECK: @_ZTV4Base = unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZNK4Base3abcEv, i32 0, i64 64320, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV4Base, i32 0, i32 0, i32 2)), ptr null] }, align 8 +// CHECK: @_ZTV8Derived2 = unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZNK8Derived23efgEv, i32 0, i64 36603, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 3)), ptr null] }, align 8 +// CHECK: @_ZTV2D2 = unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZNK2D23abcEv, i32 0, i64 20222, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 3)), ptr null] }, align 8 + +struct A { + virtual const char* abc(void) const; +}; + +const char* A::abc(void) const {return "A"; }; + +struct B : virtual A { + virtual void VF(); +}; + +void B::VF() {} + +void FUNC(B* p) { +// CHECK: [[T1:%.*]] = load ptr, ptr getelementptr inbounds (ptr, ptr @_ZTV1A, i64 2) +// CHECK-NEXT: [[BT1:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr getelementptr inbounds (ptr, ptr @_ZTV1A, i64 2) to i64), i64 12401) +// CHECK-NEXT: [[T2:%.*]] = call noundef ptr [[T1]](ptr noundef {{.*}}) [ "ptrauth"(i32 0, i64 [[BT1]]) ] + const char* c = p->A::abc(); +} + + +// Test2 +struct Base { virtual char* abc(void) const; }; + +char* Base::abc() const { return 0; } + +struct Derived : public Base { +}; + +void FUNC1(Derived* p) { +// CHECK: [[U1:%.*]] = load ptr, ptr getelementptr inbounds (ptr, ptr @_ZTV4Base, i64 2) +// CHECK-NEXT: [[BU1:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr getelementptr inbounds (ptr, ptr @_ZTV4Base, i64 2) to i64), i64 64320) +// CHECK-NEXT: [[U2:%.*]] = call noundef ptr [[U1]](ptr noundef {{.*}}) [ "ptrauth"(i32 0, i64 [[BU1]]) ] + char* c = p->Base::abc(); +} + + +// Test3 +struct Base2 { }; + +struct Derived2 : virtual Base2 { + virtual char* efg(void) const; +}; + +char* Derived2::efg(void) const { return 0; } + +void FUNC2(Derived2* p) { +// CHECK: [[V1:%.*]] = load ptr, ptr getelementptr inbounds (ptr, ptr @_ZTV8Derived2, i64 3) +// CHECK-NEXT: [[BV1:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr getelementptr inbounds (ptr, ptr @_ZTV8Derived2, i64 3) to i64), i64 36603) +// CHECK-NEXT: [[V2:%.*]] = call noundef ptr [[V1]](ptr noundef {{.*}}) [ "ptrauth"(i32 0, i64 [[BV1]]) ] + char* c = p->Derived2::efg(); +} + +// Test4 +struct Base3 { }; + +struct D1 : virtual Base3 { +}; + +struct D2 : virtual Base3 { + virtual char *abc(void) const; +}; + +struct Sub : D1, D2 { +}; + +char* D2::abc(void) const { return 0; } + +void FUNC3(Sub* p) { +// CHECK: [[W1:%.*]] = load ptr, ptr getelementptr inbounds (ptr, ptr @_ZTV2D2, i64 3) +// CHECK-NEXT: [[BW1:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr getelementptr inbounds (ptr, ptr @_ZTV2D2, i64 3) to i64), i64 20222) +// CHECK-NEXT: [[W2:%.*]] = call noundef ptr [[W1]](ptr noundef {{.*}}) [ "ptrauth"(i32 0, i64 [[BW1]]) ] + char* c = p->D2::abc(); +} + + +// Test4 +struct Base4 { virtual void abc(); }; + +void Base4::abc() {} + +struct Derived4 : public Base4 { + void abc() override; +}; + +void Derived4::abc() {} + +void FUNC4(Derived4* p) { +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = 
call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 0 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 426) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + p->abc(); +} diff --git a/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-call.cpp b/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-call.cpp new file mode 100644 index 00000000000000..996829a14d7d16 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-call.cpp @@ -0,0 +1,42 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fapple-kext -emit-llvm -o - %s | FileCheck %s + +// CHECK: @_ZTV5TemplIiE = internal unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI5TemplIiE, ptr ptrauth (ptr @_ZN5TemplIiE1fEv, i32 0, i64 22189, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5TemplIiE, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN5TemplIiE1gEv, i32 0, i64 9912, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5TemplIiE, i32 0, i32 0, i32 3)), ptr null] }, align 8 + +struct Base { + virtual void abc(void) const; +}; + +void Base::abc(void) const {} + +void FUNC(Base* p) { + p->Base::abc(); +} + +// CHECK: getelementptr inbounds (ptr, ptr @_ZTV4Base, i64 2) +// CHECK-NOT: call void @_ZNK4Base3abcEv + +template <class T> +struct Templ { + virtual void f() {} + virtual void g() {} +}; +template <class T> +struct SubTempl : public Templ<T> { + virtual void f() {} // override + virtual void g() {} // override +}; + +void f(SubTempl<int>* t) { + // Qualified calls go through the (qualified) vtable in apple-kext mode. + // Since t's this pointer points to SubTempl's vtable, the call needs + // to load Templ's vtable. Hence, Templ::g needs to be + // instantiated in this TU, for it's referenced by the vtable. + // (This happens only in apple-kext mode; elsewhere virtual calls can always + // use the vtable pointer off this instead of having to load the vtable + // symbol.)
+  t->Templ::f(); +} + +// CHECK: getelementptr inbounds (ptr, ptr @_ZTV5TemplIiE, i64 2) +// CHECK: define internal void @_ZN5TemplIiE1fEv(ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %this) +// CHECK: define internal void @_ZN5TemplIiE1gEv(ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %this) diff --git a/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-virtual-dtor-call.cpp b/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-virtual-dtor-call.cpp new file mode 100644 index 00000000000000..7bcf1fbfdb9de9 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-apple-kext-indirect-virtual-dtor-call.cpp @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++98 -fptrauth-calls -fapple-kext -fno-rtti -disable-O0-optnone -emit-llvm -o - %s | FileCheck %s + +// CHECK: @_ZTV5TemplIiE = internal unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5TemplIiED1Ev, i32 0, i64 57986, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV5TemplIiE, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN5TemplIiED0Ev, i32 0, i64 22856, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV5TemplIiE, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN5TemplIiE1fEv, i32 0, i64 22189, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV5TemplIiE, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN5TemplIiE1gEv, i32 0, i64 9912, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV5TemplIiE, i32 0, i32 0, i32 5)), ptr null] }, align 8 + +struct B1 { + virtual ~B1(); +}; + +B1::~B1() {} + +void DELETE(B1 *pb1) { + pb1->B1::~B1(); +} +// CHECK-LABEL: define void @_ZN2B1D0Ev +// CHECK: [[T1:%.*]] = load ptr, ptr getelementptr inbounds (ptr, ptr @_ZTV2B1, i64 2) +// CHECK-NEXT: [[B1:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr getelementptr inbounds (ptr, ptr @_ZTV2B1, i64 2) to i64), i64 14635) +// CHECK-NEXT: call noundef ptr [[T1]](ptr noundef nonnull align 8 dereferenceable(8) [[T2:%.*]]) [ "ptrauth"(i32 0, i64 [[B1]]) ] +// CHECK-LABEL: define void @_Z6DELETEP2B1 +// CHECK: [[T3:%.*]] = load ptr, ptr getelementptr inbounds (ptr, ptr @_ZTV2B1, i64 2) +// CHECK-NEXT: [[B3:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr getelementptr inbounds (ptr, ptr @_ZTV2B1, i64 2) to i64), i64 14635) +// CHECK-NEXT: call noundef ptr [[T3]](ptr noundef nonnull align 8 dereferenceable(8) [[T4:%.*]]) [ "ptrauth"(i32 0, i64 [[B3]]) + +template <class T> +struct Templ { + virtual ~Templ(); // Out-of-line so that the destructor doesn't cause a vtable + virtual void f() {} + virtual void g() {} +}; +template <class T> +struct SubTempl : public Templ<T> { + virtual ~SubTempl() {} // override + virtual void f() {} // override + virtual void g() {} // override +}; + +void f(SubTempl<int>* t) { + // Qualified calls go through the (qualified) vtable in apple-kext mode. + // Since t's this pointer points to SubTempl's vtable, the call needs + // to load Templ's vtable. Hence, Templ::g needs to be + // instantiated in this TU, for it's referenced by the vtable. + // (This happens only in apple-kext mode; elsewhere virtual calls can always + // use the vtable pointer off this instead of having to load the vtable + // symbol.)
+  t->Templ::~Templ(); +} + +// CHECK: getelementptr inbounds (ptr, ptr @_ZTV5TemplIiE, i64 2) +// CHECK: declare void @_ZN5TemplIiED0Ev(ptr noundef nonnull align 8 dereferenceable(8)) +// CHECK: define internal void @_ZN5TemplIiE1fEv(ptr noundef nonnull align 8 dereferenceable(8) %this) +// CHECK: define internal void @_ZN5TemplIiE1gEv(ptr noundef nonnull align 8 dereferenceable(8) %this) diff --git a/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp new file mode 100644 index 00000000000000..e6497b3f152aa0 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp @@ -0,0 +1,564 @@ +// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \ +// RUN: -emit-llvm -o - | FileCheck --check-prefixes=CHECK,NODISC %s + +// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \ +// RUN: -fptrauth-vtable-pointer-type-discrimination \ +// RUN: -emit-llvm -o - | FileCheck --check-prefixes=CHECK,TYPE %s + +// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \ +// RUN: -fptrauth-vtable-pointer-address-discrimination \ +// RUN: -emit-llvm -o - | FileCheck --check-prefixes=CHECK,ADDR %s + +// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \ +// RUN: -fptrauth-vtable-pointer-type-discrimination \ +// RUN: -fptrauth-vtable-pointer-address-discrimination \ +// RUN: -emit-llvm -o - | FileCheck --check-prefixes=CHECK,BOTH %s + +#include <ptrauth.h> + +namespace test1 { + +#define authenticated(a...) ptrauth_cxx_vtable_pointer(a) + +struct NoExplicitAuth { + virtual ~NoExplicitAuth(); + virtual void f(); + virtual void g(); +}; + +struct authenticated(no_authentication, no_address_discrimination, no_extra_discrimination) ExplicitlyDisableAuth { + virtual ~ExplicitlyDisableAuth(); + virtual void f(); + virtual void g(); +}; + +struct authenticated(default_key, address_discrimination, default_extra_discrimination) ExplicitAddressDiscrimination { + virtual ~ExplicitAddressDiscrimination(); + virtual void f(); + virtual void g(); +}; + +struct authenticated(default_key, no_address_discrimination, default_extra_discrimination) ExplicitNoAddressDiscrimination { + virtual ~ExplicitNoAddressDiscrimination(); + virtual void f(); + virtual void g(); +}; + +struct authenticated(default_key, default_address_discrimination, no_extra_discrimination) ExplicitNoExtraDiscrimination { + virtual ~ExplicitNoExtraDiscrimination(); + virtual void f(); + virtual void g(); +}; + +struct authenticated(default_key, default_address_discrimination, type_discrimination) ExplicitTypeDiscrimination { + virtual ~ExplicitTypeDiscrimination(); + virtual void f(); + virtual void g(); +}; + +struct authenticated(default_key, default_address_discrimination, custom_discrimination, 42424) ExplicitCustomDiscrimination { + virtual ~ExplicitCustomDiscrimination(); + virtual void f(); + virtual void g(); +}; + +template <typename T> +struct SubClass : T { + virtual void g(); + virtual T *h(); +}; + +template <typename T> +SubClass<T> *make_subclass(T *); + +struct authenticated(default_key, address_discrimination, type_discrimination) BasicStruct { + virtual ~BasicStruct(); +}; + +template <typename T> +struct PrimaryBasicStruct : BasicStruct, T {}; +template <typename T> +struct PrimaryBasicStruct<T> *make_multiple_primary(T *); + +template <typename T> +struct VirtualSubClass : virtual T { + virtual void g(); + virtual T *h(); +};
+template +struct VirtualPrimaryStruct : virtual T, VirtualSubClass {}; +template +struct VirtualPrimaryStruct *make_virtual_primary(T *); + +extern "C" { + +// CHECK: @TVDisc_NoExplicitAuth = global i32 [[DISC_DEFAULT:49565]], align 4 +int TVDisc_NoExplicitAuth = ptrauth_string_discriminator("_ZTVN5test114NoExplicitAuthE"); + +// CHECK: @TVDisc_ExplicitlyDisableAuth = global i32 [[DISC_DISABLED:24369]], align 4 +int TVDisc_ExplicitlyDisableAuth = ptrauth_string_discriminator("_ZTVN5test121ExplicitlyDisableAuthE"); + +// CHECK: @TVDisc_ExplicitAddressDiscrimination = global i32 [[DISC_ADDR:56943]], align 4 +int TVDisc_ExplicitAddressDiscrimination = ptrauth_string_discriminator("_ZTVN5test129ExplicitAddressDiscriminationE"); + +// CHECK: @TVDisc_ExplicitNoAddressDiscrimination = global i32 [[DISC_NO_ADDR:6022]], align 4 +int TVDisc_ExplicitNoAddressDiscrimination = ptrauth_string_discriminator("_ZTVN5test131ExplicitNoAddressDiscriminationE"); + +// CHECK: @TVDisc_ExplicitNoExtraDiscrimination = global i32 [[DISC_NO_EXTRA:9072]], align 4 +int TVDisc_ExplicitNoExtraDiscrimination = ptrauth_string_discriminator("_ZTVN5test129ExplicitNoExtraDiscriminationE"); + +// CHECK: @TVDisc_ExplicitTypeDiscrimination = global i32 [[DISC_TYPE:6177]], align 4 +int TVDisc_ExplicitTypeDiscrimination = ptrauth_string_discriminator("_ZTVN5test126ExplicitTypeDiscriminationE"); + + +// CHECK-LABEL: define void @test_default(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_DEFAULT]]) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_DEFAULT]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_default(NoExplicitAuth *a) { + a->f(); +} + +// CHECK-LABEL: define void @test_disabled(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// CHECK-NOT: call i64 @llvm.ptrauth.auth +void test_disabled(ExplicitlyDisableAuth *a) { + a->f(); +} + +// CHECK-LABEL: define void @test_addr_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// TYPE: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// TYPE: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_ADDR]]) +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// 
ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_ADDR]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_addr_disc(ExplicitAddressDiscrimination *a) { + a->f(); +} + +// CHECK-LABEL: define void @test_no_addr_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_NO_ADDR]]) +// +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_NO_ADDR]]) +void test_no_addr_disc(ExplicitNoAddressDiscrimination *a) { + a->f(); +} + +// CHECK-LABEL: define void @test_no_extra_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +void test_no_extra_disc(ExplicitNoExtraDiscrimination *a) { + a->f(); +} + +// CHECK-LABEL: define void @test_type_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_TYPE]]) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_TYPE]]) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_TYPE]]) +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], 
i64 [[DISC_TYPE]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_type_disc(ExplicitTypeDiscrimination *a) { + a->f(); +} + +// CHECK-LABEL: define void @test_custom_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = load ptr, ptr {{%.*}}, align 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_custom_disc(ExplicitCustomDiscrimination *a) { + a->f(); +} + +// +// Test some simple single inheritance cases. +// Codegen should be the same as the simple cases above once we have a vtable. +// + +// CHECK-LABEL: define void @test_subclass_default(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_DEFAULT]]) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_DEFAULT]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_subclass_default(NoExplicitAuth *a) { + make_subclass(a)->f(); +} + +// CHECK-LABEL: define void @test_subclass_disabled(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// CHECK-NOT: call i64 @llvm.ptrauth.auth +void test_subclass_disabled(ExplicitlyDisableAuth *a) { + make_subclass(a)->f(); +} + +// CHECK-LABEL: define void @test_subclass_addr_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// 
TYPE: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// TYPE: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_ADDR]]) +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_ADDR]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_subclass_addr_disc(ExplicitAddressDiscrimination *a) { + make_subclass(a)->f(); +} + +// CHECK-LABEL: define void @test_subclass_no_addr_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_NO_ADDR]]) +// +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_NO_ADDR]]) +void test_subclass_no_addr_disc(ExplicitNoAddressDiscrimination *a) { + make_subclass(a)->f(); +} + +// CHECK-LABEL: define void @test_subclass_no_extra_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +void test_subclass_no_extra_disc(ExplicitNoExtraDiscrimination *a) { + make_subclass(a)->f(); +} + +// CHECK-LABEL: define void @test_subclass_type_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_TYPE]]) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_TYPE]]) +// 
+// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_TYPE]]) +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_TYPE]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_subclass_type_disc(ExplicitTypeDiscrimination *a) { + make_subclass(a)->f(); +} + +// CHECK-LABEL: define void @test_subclass_custom_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTADDR:%.*]] = call noundef ptr @_ZN5test113make_subclass +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_subclass_custom_disc(ExplicitCustomDiscrimination *a) { + make_subclass(a)->f(); +} + + +// +// Test some simple multiple inheritance cases. +// Codegen should be the same as the simple cases above once we have a vtable. 
+// + +// CHECK-LABEL: define void @test_multiple_default(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[CALL:%.*]] = call noundef ptr @_ZN5test121make_multiple_primary +// CHECK: [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_DEFAULT]]) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_DEFAULT]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_multiple_default(NoExplicitAuth *a) { + make_multiple_primary(a)->f(); +} + +// CHECK-LABEL: define void @test_multiple_disabled(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[CALL:%.*]] = call noundef ptr @_ZN5test121make_multiple_primary +// CHECK: [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// CHECK-NOT: call i64 @llvm.ptrauth.auth +void test_multiple_disabled(ExplicitlyDisableAuth *a) { + make_multiple_primary(a)->f(); +} + +// CHECK-LABEL: define void @test_multiple_custom_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[CALL:%.*]] = call noundef ptr @_ZN5test121make_multiple_primary +// CHECK: [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 8 +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_multiple_custom_disc(ExplicitCustomDiscrimination *a) { + make_multiple_primary(a)->f(); +} + +// +// Test some virtual inheritance cases. +// Codegen should be the same as the simple cases above once we have a vtable, +// but twice for vtt/vtable. The names in the vtt version have "VTT" prefixes. 
+// + +// CHECK-LABEL: define void @test_virtual_default(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTTADDR:%.*]] = call noundef ptr @_ZN5test120make_virtual_primary +// CHECK: [[VTTABLE:%.*]] = load ptr, ptr [[VTTADDR]], align 8 +// +// NODISC: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 [[DISC_DEFAULT]]) +// +// ADDR: [[VTTADDRI64:%.*]] = ptrtoint ptr [[VTTADDR]] to i64 +// ADDR: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 [[VTTADDRI64]]) +// +// BOTH: [[VTTADDRI64:%.*]] = ptrtoint ptr [[VTTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTTADDRI64]], i64 [[DISC_DEFAULT]]) +// BOTH: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 [[BLEND]]) + +// CHECK: [[AUTHEDPTR:%.*]] = inttoptr i64 [[AUTHED]] to ptr +// CHECK: [[VBOFFPTR:%.*]] = getelementptr i8, ptr [[AUTHEDPTR]], i64 -48 +// CHECK: [[VBOFFSET:%.*]] = load i64, ptr [[VBOFFPTR]] +// CHECK: [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[VTTADDR]], i64 [[VBOFFSET]] +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 0) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[DISC_DEFAULT]]) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[VTADDRI64]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 [[DISC_DEFAULT]]) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_virtual_default(NoExplicitAuth *a) { + make_virtual_primary(a)->f(); +} + +// CHECK-LABEL: define void @test_virtual_disabled(ptr noundef {{%.*}}) {{#.*}} { +// CHECK-NOT: call i64 @llvm.ptrauth.auth +void test_virtual_disabled(ExplicitlyDisableAuth *a) { + make_virtual_primary(a)->f(); +} + +// CHECK-LABEL: define void @test_virtual_custom_disc(ptr noundef {{%.*}}) {{#.*}} { +// CHECK: [[VTTADDR:%.*]] = call noundef ptr @_ZN5test120make_virtual_primary +// CHECK: [[VTTABLE:%.*]] = load ptr, ptr [[VTTADDR]], align 8 +// +// NODISC: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 42424) +// +// TYPE: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 42424) +// +// ADDR: [[VTTADDRI64:%.*]] = ptrtoint ptr [[VTTADDR]] to i64 +// ADDR: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTTADDRI64]], i64 42424) +// ADDR: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 [[BLEND]]) +// +// BOTH: [[VTTADDRI64:%.*]] = ptrtoint ptr 
[[VTTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTTADDRI64]], i64 42424) +// BOTH: [[VTTABLEI64:%.*]] = ptrtoint ptr [[VTTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTTABLEI64]], i32 2, i64 [[BLEND]]) + +// CHECK: [[AUTHEDPTR:%.*]] = inttoptr i64 [[AUTHED]] to ptr +// CHECK: [[VBOFFPTR:%.*]] = getelementptr i8, ptr [[AUTHEDPTR]], i64 -48 +// CHECK: [[VBOFFSET:%.*]] = load i64, ptr [[VBOFFPTR]] +// CHECK: [[VTADDR:%.*]] = getelementptr inbounds i8, ptr [[VTTADDR]], i64 [[VBOFFSET]] +// CHECK: [[VTABLE:%.*]] = load ptr, ptr [[VTADDR]], align 8 +// +// NODISC: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// TYPE: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// TYPE: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 42424) +// +// ADDR: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// ADDR: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// ADDR: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// ADDR: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +// +// BOTH: [[VTADDRI64:%.*]] = ptrtoint ptr [[VTADDR]] to i64 +// BOTH: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTADDRI64]], i64 42424) +// BOTH: [[VTABLEI64:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// BOTH: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI64]], i32 2, i64 [[BLEND]]) +void test_virtual_custom_disc(ExplicitCustomDiscrimination *a) { + make_virtual_primary(a)->f(); +} + +} // extern "C" +} // namespace test1 diff --git a/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp new file mode 100644 index 00000000000000..b4a8784a33d8cd --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 %s -I%S -triple=arm64-apple-ios -fptrauth-calls -std=c++11 -emit-llvm -o - | FileCheck %s +#include + +struct A { int a; }; + +// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] +// CHECK: @_ZTS1A = linkonce_odr hidden constant [3 x i8] c"1A\00" +// CHECK: @_ZTI1A = linkonce_odr hidden constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1A to i64), i64 -9223372036854775808) to ptr) } + +auto ATI = typeid(A); diff --git a/clang/test/CodeGenCXX/ptrauth-thunks.cpp b/clang/test/CodeGenCXX/ptrauth-thunks.cpp new file mode 100644 index 00000000000000..a85c8c4c065c46 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-thunks.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck %s + +namespace Test1 { + struct B1 { + virtual void* foo1() { + return 0; + } + }; + struct Pad1 { + virtual ~Pad1() {} + }; + struct Proxy1 : Pad1, B1 { + virtual ~Proxy1() {} + }; + struct D : virtual Proxy1 { + virtual ~D() {} + virtual void* foo1(); + }; + void* D::foo1() { + return (void*)this; + } +} + +// CHECK-LABEL: define linkonce_odr void @_ZTv0_n24_N5Test11DD0Ev(ptr noundef %this) +// CHECK: %[[This:.*]] = load ptr +// CHECK: %[[SignedVTable:.*]] = load ptr, ptr %[[This]], align 8 +// CHECK: %[[SignedVTableAsInt:.*]] = ptrtoint ptr %[[SignedVTable]] to i64 +// CHECK: %[[VTable:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[SignedVTableAsInt]], i32 2, i64 0) diff --git 
a/clang/test/CodeGenCXX/ptrauth-virtual-function.cpp b/clang/test/CodeGenCXX/ptrauth-virtual-function.cpp new file mode 100644 index 00000000000000..563d60780769bb --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-virtual-function.cpp @@ -0,0 +1,581 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - | FileCheck %s + +// Check virtual function pointers in vtables are signed. + +// CHECK: %[[CLASS_B1:.*]] = type { ptr } + +// CHECK: @_ZTV2B1 = unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI2B1, ptr ptrauth (ptr @_ZN2B12m0Ev, i32 0, i64 58196, ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2B1, i32 0, i32 0, i32 2))] }, align 8 + +// CHECK: @g_B1 = global %class.B1 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV2B1, i32 0, i32 0, i32 2), i32 2) }, align 8 + +// CHECK: @_ZTV2B0 = unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI2B0, +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV2B0, i32 0, i32 0, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV2B0, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV2B0, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B0D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV2B0, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B0D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV2B0, i32 0, i32 0, i32 6))] }, align 8 + +// CHECK: @_ZTV2D0 = unnamed_addr constant { [9 x ptr] } { [9 x ptr] [ptr null, ptr @_ZTI2D0, +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D02m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTch0_h4_N2D02m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D0D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D0D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D02m1Ev, i32 0, i64 35045, ptr getelementptr inbounds ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D02m3Ev, i32 0, i64 10565, ptr getelementptr inbounds ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 8))] }, align 8 + +// CHECK: @_ZTV2D1 = unnamed_addr constant { [8 x ptr] } { [8 x ptr] [ptr null, ptr @_ZTI2D1, +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D12m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTch0_h4_N2D12m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D1D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr 
@_ZN2D1D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D12m1Ev, i32 0, i64 52864, ptr getelementptr inbounds ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 7))] }, align 8 + +// CHECK: @_ZTV2D2 = unnamed_addr constant { [9 x ptr], [8 x ptr] } { [9 x ptr] [ptr null, ptr @_ZTI2D2, +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D22m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTch0_h4_N2D22m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D2D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D2D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D22m1Ev, i32 0, i64 35045, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D22m3Ev, i32 0, i64 10565, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 8))], +// CHECK-SAME: [8 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr @_ZTI2D2, +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D22m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 1, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTchn16_h4_N2D22m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 1, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 1, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D2D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 1, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D2D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D22m1Ev, i32 0, i64 52864, ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 1, i32 7))] }, align 8 + +// CHECK: @_ZTV2D3 = unnamed_addr constant { [7 x ptr], [7 x ptr], [11 x ptr] } { [7 x ptr] [ptr inttoptr (i64 32 to ptr), ptr null, ptr @_ZTI2D3, +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D32m0Ev, i32 0, i64 44578, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D32m1Ev, i32 0, i64 30766, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D3D1Ev, i32 0, i64 57279, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2D3D0Ev, i32 0, i64 62452, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 0, i32 6))], +// CHECK-SAME: [7 x ptr] [ptr inttoptr (i64 16 to ptr), ptr inttoptr (i64 -16 to ptr), ptr @_ZTI2D3, +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D32m0Ev, i32 0, i64 49430, ptr getelementptr inbounds ({ [7 x 
ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 1, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D32m1Ev, i32 0, i64 57119, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 1, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D3D1Ev, i32 0, i64 60799, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 1, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn16_N2D3D0Ev, i32 0, i64 52565, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 1, i32 6))], +// CHECK-SAME: [11 x ptr] [ptr inttoptr (i64 -32 to ptr), ptr null, ptr inttoptr (i64 -32 to ptr), ptr inttoptr (i64 -32 to ptr), ptr inttoptr (i64 -32 to ptr), ptr @_ZTI2D3, +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n24_N2D32m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 2, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTcv0_n32_h4_N2D32m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 2, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 2, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N2D3D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 2, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N2D3D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [7 x ptr], [7 x ptr], [11 x ptr] }, ptr @_ZTV2D3, i32 0, i32 2, i32 10))] }, align 8 + +// CHECK: @_ZTC2D30_2V0 = unnamed_addr constant { [7 x ptr], [11 x ptr] } { [7 x ptr] [ptr inttoptr (i64 32 to ptr), ptr null, ptr @_ZTI2V0, +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V02m0Ev, i32 0, i64 44578, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V02m1Ev, i32 0, i64 30766, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V0D1Ev, i32 0, i64 57279, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V0D0Ev, i32 0, i64 62452, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 0, i32 6))], +// CHECK-SAME: [11 x ptr] [ptr inttoptr (i64 -32 to ptr), ptr null, ptr inttoptr (i64 -32 to ptr), ptr inttoptr (i64 -32 to ptr), ptr inttoptr (i64 -32 to ptr), ptr @_ZTI2V0, +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n24_N2V02m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTcv0_n32_h4_N2V02m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N2V0D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N2V0D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D30_2V0, i32 0, i32 1, i32 10))] }, align 8 + +// CHECK: @_ZTC2D316_2V1 = unnamed_addr constant { [7 x ptr], [11 x ptr] } { 
[7 x ptr] [ptr inttoptr (i64 16 to ptr), ptr null, ptr @_ZTI2V1, +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V12m0Ev, i32 0, i64 49430, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V12m1Ev, i32 0, i64 57119, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V1D1Ev, i32 0, i64 60799, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2V1D0Ev, i32 0, i64 52565, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 0, i32 6))], +// CHECK-SAME: [11 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr inttoptr (i64 -16 to ptr), ptr inttoptr (i64 -16 to ptr), ptr inttoptr (i64 -16 to ptr), ptr @_ZTI2V1, +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n24_N2V12m0Ev, i32 0, i64 53119, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTcv0_n32_h4_N2V12m1Ev, i32 0, i64 15165, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN2B02m2Ev, i32 0, i64 43073, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N2V1D1Ev, i32 0, i64 25525, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N2V1D0Ev, i32 0, i64 21295, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC2D316_2V1, i32 0, i32 1, i32 10))] }, align 8 + + +struct S0 { + int f; +}; + +struct S1 { + int f; +}; + +struct S2 : S0, S1 { + int f; +}; + +class B0 { +public: + virtual void m0(); + virtual S1 *m1(); + virtual void m2(); + virtual ~B0(); + int f; +}; + +class B1 { +public: + virtual void m0(); +}; + +class D0 : public B0 { +public: + void m0() override; + S2 *m1() override; + virtual void m3(); + int f; +}; + +class D1 : public B0 { +public: + void m0() override; + S2 *m1() override; + int f; +}; + +class D2 : public D0, public D1 { +public: + void m0() override; + S2 *m1() override; + void m3() override; + int f; +}; + +class V0 : public virtual B0 { +public: + void m0() override; + S2 *m1() override; + int f; +}; + +class V1 : public virtual B0 { +public: + void m0() override; + S2 *m1() override; + ~V1(); + int f; +}; + +class D3 : public V0, public V1 { +public: + void m0() override; + S2 *m1() override; + int f; +}; + +B1 g_B1; + +void B0::m0() {} + +void B1::m0() {} + +void D0::m0() {} + +void D1::m0() {} + +void D2::m0() {} + +void D3::m0() {} + +V1::~V1() { + m1(); +} + +// Check sign/authentication of vtable pointers and authentication of virtual +// functions. 
+ +// CHECK-LABEL: define noundef ptr @_ZN2V1D2Ev( +// CHECK: %[[THIS1:.*]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T1:[0-9]+]] = ptrtoint ptr %[[T0]] to i64 +// CHECK: %[[T2:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T1]], i32 2, i64 0) +// CHECK: %[[T3:[0-9]+]] = inttoptr i64 %[[T2]] to ptr +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[T3]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 %[[T6]], i32 2, i64 0) +// CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T7]] to ptr +// CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS1]] + +// CHECK-LABEL: define void @_Z8testB0m0P2B0( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 0 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 53119) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testB0m0(B0 *a) { + a->m0(); +} + +// CHECK-LABEL: define void @_Z8testB0m1P2B0( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 1 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 15165) +// CHECK: call noundef ptr %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testB0m1(B0 *a) { + a->m1(); +} + +// CHECK-LABEL: define void @_Z8testB0m2P2B0( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 2 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 43073) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testB0m2(B0 *a) { + a->m2(); +} + +// CHECK-LABEL: define void @_Z8testD0m0P2D0( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 0 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 53119) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD0m0(D0 *a) { + a->m0(); 
+} + +// CHECK-LABEL: define void @_Z8testD0m1P2D0( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 5 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 35045) +// CHECK: call noundef ptr %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD0m1(D0 *a) { + a->m1(); +} + +// CHECK-LABEL: define void @_Z8testD0m2P2D0( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 2 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 43073) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD0m2(D0 *a) { + a->m2(); +} + +// CHECK-LABEL: define void @_Z8testD0m3P2D0( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 6 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 10565) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD0m3(D0 *a) { + a->m3(); +} + + +// CHECK-LABEL: define void @_Z8testD1m0P2D1( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 0 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 53119) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD1m0(D1 *a) { + a->m0(); +} + +// CHECK-LABEL: define void @_Z8testD1m1P2D1( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 5 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 52864) +// CHECK: call noundef ptr 
%[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD1m1(D1 *a) { + a->m1(); +} + +// CHECK-LABEL: define void @_Z8testD1m2P2D1( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 2 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 43073) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD1m2(D1 *a) { + a->m2(); +} + + +// CHECK-LABEL: define void @_Z8testD2m0P2D2( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 0 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 53119) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(36) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD2m0(D2 *a) { + a->m0(); +} + +// CHECK-LABEL: define void @_Z8testD2m1P2D2( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 5 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 35045) +// CHECK: call noundef ptr %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(36) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD2m1(D2 *a) { + a->m1(); +} + +// CHECK-LABEL: define void @_Z10testD2m2D0P2D2( +// CHECK: call void @_ZN2B02m2Ev(ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}){{$}} + +void testD2m2D0(D2 *a) { + a->D0::m2(); +} + +// CHECK-LABEL: define void @_Z10testD2m2D1P2D2( +// CHECK: call void @_ZN2B02m2Ev(ptr noundef nonnull align {{[0-9]+}} dereferenceable(12) %{{.*}}){{$}} + +void testD2m2D1(D2 *a) { + a->D1::m2(); +} + +// CHECK-LABEL: define void @_Z8testD2m3P2D2( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 6 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 10565) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(36) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD2m3(D2 *a) { + a->m3(); +} + +// CHECK-LABEL: define void 
@_Z8testD3m0P2D3( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 0 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 44578) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD3m0(D3 *a) { + a->m0(); +} + +// CHECK-LABEL: define void @_Z8testD3m1P2D3( +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T0]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 1 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 30766) +// CHECK: call noundef ptr %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD3m1(D3 *a) { + a->m1(); +} + +// CHECK: define void @_Z8testD3m2P2D3(ptr noundef %[[A:.*]]) +// CHECK: %[[A_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[A]], ptr %[[A_ADDR]], align 8 +// CHECK: %[[V0:.*]] = load ptr, ptr %[[A_ADDR]], align 8 +// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[V0]], align 8 +// CHECK: %[[V1:.*]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[V2:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V1]], i32 2, i64 0) +// CHECK: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr +// CHECK: %[[VBASE_OFFSET_PTR:.*]] = getelementptr i8, ptr %[[V3]], i64 -24 +// CHECK: %[[VBASE_OFFSET:.*]] = load i64, ptr %[[VBASE_OFFSET_PTR]], align 8 +// CHECK: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[V0]], i64 %[[VBASE_OFFSET]] +// CHECK: %[[VTABLE1:.*]] = load ptr, ptr %[[ADD_PTR]], align 8 +// CHECK: %[[V4:.*]] = ptrtoint ptr %[[VTABLE1]] to i64 +// CHECK: %[[V5:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V4]], i32 2, i64 0) +// CHECK: %[[V6:.*]] = inttoptr i64 %[[V5]] to ptr +// CHECK: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[V6]], i64 2 +// CHECK: %[[V7:.*]] = load ptr, ptr %[[VFN]], align 8 +// CHECK: %[[V8:.*]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[V9:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V8]], i64 43073) +// CHECK: call void %[[V7]](ptr noundef nonnull align 8 dereferenceable(12) %[[ADD_PTR]]) [ "ptrauth"(i32 0, i64 %[[V9]]) ] + +void testD3m2(D3 *a) { + a->m2(); +} + +// CHECK-LABEL: define void @_Z17testD3Destructor0P2D3( +// CHECK: load ptr, ptr +// CHECK: %[[VTABLE:.*]] = load ptr, ptr %{{.*}} +// CHECK: %[[T2:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T2]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:[a-z]+]] = getelementptr inbounds ptr, ptr %[[T4]], i64 3 +// CHECK: %[[T5:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 62452) +// CHECK: call void %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ 
"ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD3Destructor0(D3 *a) { + delete a; +} + +// CHECK-LABEL: define void @_Z17testD3Destructor1P2D3( +// CHECK: %[[T6:.*]] = load ptr, ptr % +// CHECK: %[[VTABLE0:[a-z0-9]+]] = load ptr, ptr % +// CHECK: %[[T2:[0-9]+]] = ptrtoint ptr %[[VTABLE0]] to i64 +// CHECK: %[[T3:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T2]], i32 2, i64 0) +// CHECK: %[[T4:[0-9]+]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[COMPLETE_OFFSET_PTR:.*]] = getelementptr inbounds i64, ptr %[[T4]], i64 -2 +// CHECK: %[[T5:[0-9]+]] = load i64, ptr %[[COMPLETE_OFFSET_PTR]] +// CHECK: %[[T7:[0-9]+]] = getelementptr inbounds i8, ptr %[[T6]], i64 %[[T5]] +// CHECK: %[[VTABLE1:[a-z0-9]+]] = load ptr, ptr %[[T6]] +// CHECK: %[[T9:[0-9]+]] = ptrtoint ptr %[[VTABLE1]] to i64 +// CHECK: %[[T10:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T9]], i32 2, i64 0) +// CHECK: %[[T11:[0-9]+]] = inttoptr i64 %[[T10]] to ptr +// CHECK: %[[VFN:[a-z0-9]+]] = getelementptr inbounds ptr, ptr %[[T11]], i64 2 +// CHECK: %[[T12:[0-9]+]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T13:[0-9]+]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T14:[0-9]+]] = call i64 @llvm.ptrauth.blend(i64 %[[T13]], i64 57279) +// CHECK: %call = call noundef ptr %[[T12]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T14]]) ] +// CHECK: call void @_ZdlPv(ptr noundef %[[T7]]) + +void testD3Destructor1(D3 *a) { + ::delete a; +} + +// CHECK-LABEL: define void @_Z17testD3Destructor2P2D3( +// CHECK: load ptr, ptr +// CHECK: %[[VTABLE:.*]] = load ptr, ptr % +// CHECK: %[[T2:.*]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T3:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[T2]], i32 2, i64 0) +// CHECK: %[[T4:.*]] = inttoptr i64 %[[T3]] to ptr +// CHECK: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[T4]], i64 2 +// CHECK: %[[T5:.*]] = load ptr, ptr %[[VFN]] +// CHECK: %[[T6:.*]] = ptrtoint ptr %[[VFN]] to i64 +// CHECK: %[[T7:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[T6]], i64 57279) +// CHECK: %call = call noundef ptr %[[T5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(32) %{{.*}}) #{{.*}} [ "ptrauth"(i32 0, i64 %[[T7]]) ] + +void testD3Destructor2(D3 *a) { + a->~D3(); +} + +void materializeConstructors() { + B0 B0; + B1 B1; + D0 D0; + D1 D1; + D2 D2; + D3 D3; + V0 V0; + V1 V1; +} + +// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2B0C2Ev( +// CHECK: %[[THIS:.*]] = load ptr, ptr % +// CHECK: %[[T0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 40) ({ [7 x ptr] }, ptr @_ZTV2B0, i32 0, i32 0, i32 2) to i64), i32 2, i64 0) +// CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T0]] to ptr +// CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS]] + +// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2D0C2Ev( +// CHECK: %[[T0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 56) ({ [9 x ptr] }, ptr @_ZTV2D0, i32 0, i32 0, i32 2) to i64), i32 2, i64 0) +// CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T0]] to ptr +// CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS]] + +// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2D1C2Ev( +// CHECK: %[[T0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 48) ({ [8 x ptr] }, ptr @_ZTV2D1, i32 0, i32 0, i32 2) to i64), i32 2, i64 0) +// CHECK: %[[SIGNED_VTADDR:[0-9]+]] = inttoptr i64 %[[T0]] to ptr +// CHECK: store ptr %[[SIGNED_VTADDR]], ptr %[[THIS]] + +// CHECK-LABEL: define linkonce_odr noundef ptr 
@_ZN2D2C2Ev( +// CHECK: %[[SLOT0:.*]] = load ptr, ptr +// CHECK: %[[SIGN_VTADDR0:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 56) ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 0, i32 2) to i64), i32 2, i64 0) +// CHECK: %[[T1:[0-9]+]] = inttoptr i64 %[[SIGN_VTADDR0]] to ptr +// CHECK: store ptr %[[T1]], ptr %[[SLOT0]] +// CHECK: %[[T3:[a-z0-9.]+]] = getelementptr inbounds i8, ptr %[[SLOT0]], i64 16 +// CHECK: %[[SIGN_VTADDR1:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr getelementptr inbounds inrange(-16, 48) ({ [9 x ptr], [8 x ptr] }, ptr @_ZTV2D2, i32 0, i32 1, i32 2) to i64), i32 2, i64 0) +// CHECK: %[[T5:[0-9]+]] = inttoptr i64 %[[SIGN_VTADDR1]] to ptr +// CHECK: store ptr %[[T5]], ptr %[[T3]] + +// CHECK-LABEL: define linkonce_odr noundef ptr @_ZN2V0C2Ev( +// CHECK: %[[THIS1]] = load ptr, ptr % +// CHECK: %[[VTT:[a-z0-9]+]] = load ptr, ptr %{{.*}} +// CHECK: %[[T0:[0-9]+]] = load ptr, ptr %[[VTT]] +// CHECK: %[[T1:[0-9]+]] = ptrtoint ptr %[[T0]] to i64 +// CHECK: %[[T2:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T1]], i32 2, i64 0) +// CHECK: %[[T3:[0-9]+]] = inttoptr i64 %[[T2]] to ptr +// CHECK: %[[VTADDR0:[0-9]+]] = ptrtoint ptr %[[T3]] to i64 +// CHECK: %[[T7:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 %[[VTADDR0]], i32 2, i64 0) +// CHECK: %[[SIGN_VTADDR0:[0-9]+]] = inttoptr i64 %[[T7]] to ptr +// CHECK: store ptr %[[SIGN_VTADDR0]], ptr %[[SLOT0]] +// CHECK: %[[T9:[0-9]+]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 1 +// CHECK: %[[T10:[0-9]+]] = load ptr, ptr %[[T9]] +// CHECK: %[[T11:[0-9]+]] = ptrtoint ptr %[[T10]] to i64 +// CHECK: %[[T12:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T11]], i32 2, i64 0) +// CHECK: %[[T13:[0-9]+]] = inttoptr i64 %[[T12]] to ptr +// CHECK: %[[VTABLE:[a-z]+]] = load ptr, ptr %[[THIS1]] +// CHECK: %[[T15:[0-9]+]] = ptrtoint ptr %[[VTABLE]] to i64 +// CHECK: %[[T16:[0-9]+]] = call i64 @llvm.ptrauth.auth(i64 %[[T15]], i32 2, i64 0) +// CHECK: %[[T17:[0-9]+]] = inttoptr i64 %[[T16]] to ptr +// CHECK: %[[VBASE_OFFSET_PTR:[a-z.]+]] = getelementptr i8, ptr %[[T17]], i64 -24 +// CHECK: %[[VBASE_OFFSET:[a-z.]+]] = load i64, ptr %[[VBASE_OFFSET_PTR]] +// CHECK: %[[T20:[a-z.]+]] = getelementptr inbounds i8, ptr %[[THIS1]], i64 %[[VBASE_OFFSET]] +// CHECK: %[[VTADDR1:[0-9]+]] = ptrtoint ptr %[[T13]] to i64 +// CHECK: %[[T23:[0-9]+]] = call i64 @llvm.ptrauth.sign(i64 %[[VTADDR1]], i32 2, i64 0) +// CHECK: %[[SIGN_VTADDR1:[0-9]+]] = inttoptr i64 %[[T23]] to ptr +// CHECK: store ptr %[[SIGN_VTADDR1]], ptr %[[T20]] diff --git a/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp new file mode 100644 index 00000000000000..00b1cbd06e0f03 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp @@ -0,0 +1,309 @@ +// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-intrinsics -fptrauth-calls -fptrauth-vtable-pointer-type-discrimination -emit-llvm -O0 -disable-llvm-passes -o - | FileCheck --check-prefix=CHECK %s + +// The actual vtable construction + +// CHECK: @_ZTV1C = unnamed_addr constant { [5 x ptr], [11 x ptr] } { [5 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr @_ZTI1C, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1CD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1CD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, 
i32 0, i32 0, i32 4))], +// CHECK-SAME: [11 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -8 to ptr), ptr @_ZTI1C, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1CD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1CD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 1, i32 10))] }, align 8 + +// CHECK: @_ZTT1C = unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 0, i32 3), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [5 x ptr], [11 x ptr] }, ptr @_ZTV1C, i32 0, i32 1, i32 6), i32 2)], align 8 + +// CHECK: @_ZTV1D = unnamed_addr constant { [7 x ptr], [11 x ptr] } { [7 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr @_ZTI1D, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1DD1Ev, i32 0, i64 59423, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1DD0Ev, i32 0, i64 25900, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1gEv, i32 0, i64 59070, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1hEz, i32 0, i64 65100, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 6))], +// CHECK-SAME: [11 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr inttoptr (i64 -8 to ptr), ptr @_ZTI1D, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n32_N1D1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n40_N1D1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 10))] }, align 8 + +// CHECK: @_ZTT1D = unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-24, 32) ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 3), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [7 x ptr], [11 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 6), i32 2)], align 8 + +// CHECK: @_ZTV1F = unnamed_addr constant { [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] } { [6 x ptr] [ptr inttoptr (i64 32 to ptr), ptr inttoptr (i64 16 to ptr), ptr null, ptr @_ZTI1F, 
+// CHECK-SAME: ptr ptrauth (ptr @_ZN1FD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1FD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 0, i32 5))], [7 x ptr] [ptr inttoptr (i64 8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr @_ZTI1F, +// CHECK-SAME: ptr ptrauth (ptr @_ZThn8_N1FD1Ev, i32 0, i64 59423, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 1, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn8_N1FD0Ev, i32 0, i64 25900, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 1, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1gEv, i32 0, i64 59070, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 1, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1hEz, i32 0, i64 65100, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 1, i32 6))], [11 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1F, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 2, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n32_N1D1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 2, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n40_N1D1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 2, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1FD1EvU11__vtptrauthILj0Lb0Lj62866E, i32 0, i64 2043, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 2, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1FD0EvU11__vtptrauthILj0Lb0Lj62866E, i32 0, i64 63674, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 2, i32 10))], [11 x ptr] [ptr inttoptr (i64 -32 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -32 to ptr), ptr @_ZTI1F, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1fEv, i32 0, i64 28408, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 3, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1gEv, i32 0, i64 22926, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 3, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1hEz, i32 0, i64 9832, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 3, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1FD1Ev, i32 0, i64 5817, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 3, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1FD0Ev, i32 0, i64 26464, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 3, i32 10))] }, align 8 + +// CHECK: @_ZTT1F = unnamed_addr constant [8 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 0, i32 4), i32 2), +// CHECK-SAME: ptr ptrauth 
(ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 0, i32 3), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 1, i32 6), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-24, 32) ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 0, i32 3), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 6), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 2, i32 6), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-24, 32) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 1, i32 3), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1F, i32 0, i32 3, i32 6), i32 2)], align 8 + +// CHECK: @_ZTV1G = unnamed_addr constant { [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] } { [6 x ptr] [ptr inttoptr (i64 16 to ptr), ptr inttoptr (i64 24 to ptr), ptr null, ptr @_ZTI1G, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1GD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1GD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 0, i32 5))], [7 x ptr] [ptr inttoptr (i64 16 to ptr), ptr inttoptr (i64 -8 to ptr), ptr @_ZTI1G, +// CHECK-SAME: ptr ptrauth (ptr @_ZThn8_N1GD1Ev, i32 0, i64 59423, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 1, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZThn8_N1GD0Ev, i32 0, i64 25900, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 1, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1gEv, i32 0, i64 59070, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 1, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1hEz, i32 0, i64 65100, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 1, i32 6))], [11 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1G, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1fEv, i32 0, i64 28408, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 2, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1gEv, i32 0, i64 22926, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 2, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1hEz, i32 0, i64 9832, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 2, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1GD1Ev, i32 0, i64 5817, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 2, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1GD0Ev, i32 0, i64 26464, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 2, i32 10))], [11 x ptr] [ptr inttoptr (i64 -24 to ptr), ptr inttoptr (i64 -16 to ptr), ptr inttoptr (i64 -16 to ptr), ptr null, 
ptr inttoptr (i64 -24 to ptr), ptr @_ZTI1G, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 3, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n32_N1D1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 3, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n40_N1D1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 3, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1GD1EvU11__vtptrauthILj0Lb0Lj62866E, i32 0, i64 2043, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 3, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1GD0EvU11__vtptrauthILj0Lb0Lj62866E, i32 0, i64 63674, ptr getelementptr inbounds ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 3, i32 10))] }, align 8 + +// CHECK: @_ZTT1G = unnamed_addr constant [8 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 0, i32 4), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 0, i32 3), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 1, i32 6), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-24, 32) ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 0, i32 3), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 6), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 2, i32 6), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-48, 40) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 3, i32 6), i32 2), +// CHECK-SAME: ptr ptrauth (ptr getelementptr inbounds inrange(-24, 32) ({ [6 x ptr], [7 x ptr], [11 x ptr], [11 x ptr] }, ptr @_ZTV1G, i32 0, i32 1, i32 3), i32 2)], align 8 + +// CHECK: @_ZTV1A = unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI1A, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1AD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1AD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 6))] }, align 8 + +// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] + +// CHECK: @_ZTS1A = constant [3 x i8] c"1A\00", align 1 + +// CHECK: @_ZTI1A = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A }, align 8 + +// CHECK: @_ZTVN10__cxxabiv121__vmi_class_type_infoE = external global [0 x ptr] + 
+// CHECK: @_ZTS1C = constant [3 x i8] c"1C\00", align 1 + +// CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr] + +// CHECK: @_ZTS1B = linkonce_odr hidden constant [3 x i8] c"1B\00", align 1 + +// CHECK: @_ZTI1B = linkonce_odr hidden constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1B to i64), i64 -9223372036854775808) to ptr), ptr @_ZTI1A }, align 8 + +// CHECK: @_ZTI1C = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1C, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 + +// CHECK: @_ZTS1D = constant [3 x i8] c"1D\00", align 1 + +// CHECK: @_ZTI1D = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1D, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 + +// CHECK: @_ZTV1E = unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI1E, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1fEv, i32 0, i64 28408, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1gEv, i32 0, i64 22926, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1E1hEz, i32 0, i64 9832, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1ED1Ev, i32 0, i64 5817, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1ED0Ev, i32 0, i64 26464, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 6))] }, align 8 + +// CHECK: @_ZTS1E = constant [3 x i8] c"1E\00", align 1 + +// CHECK: @_ZTI1E = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1E }, align 8 + +// CHECK: @_ZTC1F0_1C = unnamed_addr constant { [5 x ptr], [11 x ptr] } { [5 x ptr] [ptr inttoptr (i64 16 to ptr), ptr null, ptr @_ZTI1C, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1CD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1CD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 0, i32 4))], [11 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1C, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1CD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1CD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 1, i32 10))] }, align 8 + +// CHECK: @_ZTC1F8_1D = unnamed_addr constant { [7 x ptr], 
[11 x ptr] } { [7 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr @_ZTI1D, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1DD1Ev, i32 0, i64 59423, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1DD0Ev, i32 0, i64 25900, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1gEv, i32 0, i64 59070, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1hEz, i32 0, i64 65100, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 0, i32 6))], [11 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr inttoptr (i64 -8 to ptr), ptr @_ZTI1D, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n32_N1D1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n40_N1D1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 10))] }, align 8 + +// CHECK: @_ZTS1F = constant [3 x i8] c"1F\00", align 1 + +// CHECK: @_ZTI1F = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1F, i32 3, i32 3, ptr @_ZTI1C, i64 2, ptr @_ZTI1D, i64 2050, ptr @_ZTI1E, i64 -8189 }, align 8 + +// CHECK: @_ZTC1G0_1C = unnamed_addr constant { [5 x ptr], [11 x ptr] } { [5 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr @_ZTI1C, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1CD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1CD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 0, i32 4))], [11 x ptr] [ptr inttoptr (i64 -24 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -24 to ptr), ptr @_ZTI1C, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1CD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1CD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 1, i32 10))] }, align 8 + +// CHECK: @_ZTC1G8_1D = unnamed_addr constant { [7 x ptr], [11 x ptr] } { [7 x ptr] [ptr inttoptr 
(i64 16 to ptr), ptr null, ptr @_ZTI1D, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1DD1Ev, i32 0, i64 59423, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1DD0Ev, i32 0, i64 25900, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1gEv, i32 0, i64 59070, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1D1hEz, i32 0, i64 65100, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 0, i32 6))], [11 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr inttoptr (i64 -16 to ptr), ptr inttoptr (i64 -16 to ptr), ptr null, ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1D, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 6)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n32_N1D1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 7)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n40_N1D1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 8)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 9)), +// CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 10))] }, align 8 + +// CHECK: @_ZTS1G = constant [3 x i8] c"1G\00", align 1 + +// CHECK: @_ZTI1G = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1G, i32 3, i32 3, ptr @_ZTI1E, i64 -8189, ptr @_ZTI1C, i64 2, ptr @_ZTI1D, i64 2050 }, align 8 + +// CHECK: @_ZTV1B = linkonce_odr unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI1B, +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 3)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1A1hEz, i32 0, i64 31735, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 4)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1BD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 5)), +// CHECK-SAME: ptr ptrauth (ptr @_ZN1BD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 6))] }, align 8 + +extern "C" int printf(const char *format, ...); + +class A { +public: + A() {} + virtual int f(); + virtual int g(); + virtual int h(...); + virtual ~A() {} + +public: + bool necessary_field; +}; + +class B : public A { +public: + B() : A() {} + virtual ~B() {} +}; + +class C : public virtual B { +public: + C() : B() {} + ~C(); +}; + +class D : public virtual B { +public: + D() : B() {} + ~D(); + virtual int g(); + virtual int h(...); +}; + +class E { +public: + virtual int f(); + virtual int g(); + virtual int h(...); + virtual ~E(){}; +}; + +class F : public C, public D, public virtual E { + ~F(); +}; + +class G : public virtual E, public C, public D { + ~G(); +}; + +C::~C() {} +D::~D() 
{}
+F::~F() {}
+G::~G() {}
+int E::f() { return 1; }
+int A::f() { return 0; }
+int E::g() { return 1; }
+int A::g() { return 0; }
+int D::g() { return 0; }
+
+int E::h(...) { return 1; }
+int A::h(...) { return 0; }
+int D::h(...) { return 0; }
+
+int main() {
+  A *ans = new C();
+  delete ans;
+
+  B *b = new D();
+  b->f();
+  b->g();
+  b->h(1,2,3);
+  b = new C();
+  b->f();
+  b->h(1,2,3);
+  b = new C();
+  b->f();
+  b->h(1,2,3);
+  b = new F();
+  b->f();
+  b->g();
+  b->h(1,2,3);
+
+  ans = new B();
+  delete ans;
+
+  ans = new F();
+  ans->f();
+  ans->g();
+  ans->h(1,2,3);
+  delete ans;
+
+  E *e = new F();
+  e->f();
+  e->g();
+  e->h(1,2,3);
+  delete e;
+  e = new G();
+  e->f();
+  e->g();
+  e->h(1,2,3);
+  delete e;
+}
+
+// And check the thunks
+// CHECK: ptr @_ZTv0_n48_N1CD1Ev(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
+
+// CHECK: void @_ZTv0_n48_N1CD0Ev(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
+
+// CHECK: ptr @_ZTv0_n48_N1DD1Ev(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
+
+// CHECK: void @_ZTv0_n48_N1DD0Ev(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
+
+// CHECK: void @_ZTv0_n48_N1FD0EvU11__vtptrauthILj0Lb0Lj62866E(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
+
+// CHECK: void @_ZTv0_n48_N1FD0Ev(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 12810)
+
+// CHECK: void @_ZTv0_n48_N1GD0Ev(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 12810)
+
+// CHECK: void @_ZTv0_n48_N1GD0EvU11__vtptrauthILj0Lb0Lj62866E(ptr noundef %this)
+// CHECK: [[TEMP:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TEMP:%.*]], i32 2, i64 62866)
diff --git a/clang/test/CodeGenCXX/trivial-auto-var-init-skip-scalar-with-nonconst-init.cpp b/clang/test/CodeGenCXX/trivial-auto-var-init-skip-scalar-with-nonconst-init.cpp
new file mode 100644
index 00000000000000..ba4df6ebe115fe
--- /dev/null
+++ b/clang/test/CodeGenCXX/trivial-auto-var-init-skip-scalar-with-nonconst-init.cpp
@@ -0,0 +1,121 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -emit-llvm -o - | FileCheck %s -check-prefix=UNINIT
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=pattern %s -emit-llvm -o - | FileCheck %s -check-prefix=PATTERN
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=zero %s -emit-llvm -o - | FileCheck %s -check-prefix=ZERO
+
+template <typename T> void used(T &) noexcept;
+
+extern "C" {
+
+extern int get_int(int) noexcept;
+struct C {
+  int x;
+  int y;
+};
+extern C make_c() noexcept;
+
+// Scalar with a self-reference: does need auto-init.
+// UNINIT-LABEL: test_selfinit_call(
+// ZERO-LABEL: test_selfinit_call(
+// ZERO: store i32 0, ptr %self, align 4, !annotation [[AUTO_INIT:!.+]]
+// PATTERN-LABEL: test_selfinit_call(
+// PATTERN: store i32 -1431655766, ptr %self, align 4, !annotation [[AUTO_INIT:!.+]]
+void test_selfinit_call() {
+  int self = get_int(self);
+  used(self);
+}
+
+// Scalar without a self-reference: no auto-init needed.
+// UNINIT-LABEL: test_nonself_call( +// ZERO-LABEL: test_nonself_call( +// ZERO-NOT: !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_nonself_call( +// PATTERN-NOT: !annotation [[AUTO_INIT:!.+]] +void test_nonself_call() { + int x = get_int(2); + used(x); +} + +// Scalar with a self-reference: does need auto-init. +// UNINIT-LABEL: test_selfinit_lambda_call( +// ZERO-LABEL: test_selfinit_lambda_call( +// ZERO: store i32 0, ptr %self, align 4, !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_selfinit_lambda_call( +// PATTERN: store i32 -1431655766, ptr %self, align 4, !annotation [[AUTO_INIT:!.+]] +void test_selfinit_lambda_call() { + int self = [&](){ return self; }(); + used(self); +} + +// Scalar with a self-reference: does need auto-init. +// UNINIT-LABEL: test_selfinit_gnu_stmt_expression( +// ZERO-LABEL: test_selfinit_gnu_stmt_expression( +// ZERO: store i32 0, ptr %self, align 4, !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_selfinit_gnu_stmt_expression( +// PATTERN: store i32 -1431655766, ptr %self, align 4, !annotation [[AUTO_INIT:!.+]] +void test_selfinit_gnu_stmt_expression() { + int self = ({int x = self; x + 1; }); + used(self); +} + +// Not a scalar: auto-init just in case +// UNINIT-LABEL: test_nonscalar_call( +// ZERO-LABEL: test_nonscalar_call( +// ZERO: call void @llvm.memset{{.*}}, i8 0, i64 8, {{.*}} !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_nonscalar_call( +// PATTERN: call void @llvm.memcpy{{.*}}, i64 8, {{.*}} !annotation [[AUTO_INIT:!.+]] +void test_nonscalar_call() { + C c = make_c(); + used(c); +} + +// Scalar with a self-reference: does need auto-init. +// UNINIT-LABEL: test_self_ptr( +// ZERO-LABEL: test_self_ptr( +// ZERO: store ptr null, ptr %self, align 8, !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_self_ptr( +// PATTERN: store ptr inttoptr (i64 -6148914691236517206 to ptr), ptr %self, align 8, !annotation [[AUTO_INIT:!.+]] +void test_self_ptr() { + void* self = self; + used(self); +} + +// Scalar without a self-reference: no auto-init needed. +// UNINIT-LABEL: test_nonself_ptr( +// ZERO-LABEL: test_nonself_ptr( +// ZERO-NOT: !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_nonself_ptr( +// PATTERN-NOT: !annotation [[AUTO_INIT:!.+]] +void test_nonself_ptr() { + int y = 0; + void* x = &y; + used(x); +} + +// Scalar with a self-reference: does need auto-init. +// UNINIT-LABEL: test_self_complex( +// ZERO-LABEL: test_self_complex( +// ZERO: call void @llvm.memset{{.*}} !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_self_complex( +// PATTERN: call void @llvm.memcpy{{.*}} !annotation [[AUTO_INIT:!.+]] +void test_self_complex() { + _Complex float self = 3.0 * 3.0 * self; + used(self); +} + +// Scalar without a self-reference: no auto-init needed. 
+// UNINIT-LABEL: test_nonself_complex( +// ZERO-LABEL: test_nonself_complex( +// ZERO-NOT: !annotation [[AUTO_INIT:!.+]] +// PATTERN-LABEL: test_nonself_complex( +// PATTERN-NOT: !annotation [[AUTO_INIT:!.+]] +void test_nonself_complex() { + _Complex float y = 0.0; + _Complex float x = 3.0 * 3.0 * y; + used(x); +} + +} // extern "C" + +// ZERO: [[AUTO_INIT]] = !{!"auto-init"} +// PATTERN: [[AUTO_INIT]] = !{!"auto-init"} + diff --git a/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp b/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp index c70316214cf665..72c59cb41c34b1 100644 --- a/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp +++ b/clang/test/CodeGenCXX/ubsan-vtable-checks.cpp @@ -2,6 +2,7 @@ // RUN: %clang_cc1 -std=c++11 -triple x86_64-windows -emit-llvm -fsanitize=null %s -o - | FileCheck %s --check-prefix=CHECK-NULL --check-prefix=MSABI // RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux -emit-llvm -fsanitize=null,vptr %s -o - | FileCheck %s --check-prefix=CHECK-VPTR --check-prefix=ITANIUM // RUN: %clang_cc1 -std=c++11 -triple x86_64-windows -emit-llvm -fsanitize=null,vptr %s -o - | FileCheck %s --check-prefix=CHECK-VPTR --check-prefix=MSABI --check-prefix=CHECK-VPTR-MS +// RUN: %clang_cc1 -std=c++11 -triple arm64e-ios-13 -emit-llvm -fptrauth-intrinsics -fptrauth-calls -fptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-address-discrimination -fsanitize=null,vptr %s -o - | FileCheck %s --check-prefix=CHECK-VPTR --check-prefix=ITANIUM --check-prefix=CHECK-PTRAUTH struct T { virtual ~T() {} virtual int v() { return 1; } @@ -26,18 +27,49 @@ int get_v(T* t) { // CHECK-NULL: call void @__ubsan_handle_type_mismatch_v1_abort // Second, we check that vtable is actually loaded once the type check is done. // CHECK-NULL: load ptr, ptr {{.*}} + + // CHECK-PTRAUTH: [[CAST_VTABLE:%.*]] = ptrtoint ptr %vtable to i64 + // CHECK-PTRAUTH: [[STRIPPED_VTABLE:%.*]] = call i64 @llvm.ptrauth.strip(i64 [[CAST_VTABLE]], i32 0), !nosanitize !2 + // CHECK-PTRAUTH: [[STRIPPED_PTR:%.*]] = inttoptr i64 [[STRIPPED_VTABLE]] to ptr + // CHECK-PTRAUTH: [[STRIPPED_INT:%.*]] = ptrtoint ptr [[STRIPPED_PTR]] to i64 + // Make sure authed vtable pointer feeds into hashing + // CHECK-PTRAUTH: {{%.*}} = mul i64 [[STRIPPED_INT]], {{.*}} + + // Verify that we authenticate for the actual vcall + // CHECK-PTRAUTH: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 17113) + // CHECK-PTRAUTH: [[CAST_VTABLE:%.*]] = ptrtoint ptr %vtable2 to i64 + // CHECK-PTRAUTH: [[AUTHED_INT:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VTABLE]], i32 2, i64 [[BLENDED]]) + // CHECK-PTRAUTH: [[AUTHED_PTR:%.*]] = inttoptr i64 [[AUTHED_INT]] to ptr + // CHECK-PTRAUTH: {{%.*}} = getelementptr inbounds ptr, ptr [[AUTHED_PTR]], i64 2 return t->v(); } // ITANIUM: define{{.*}} void @_Z9delete_itP1T // MSABI: define dso_local void @"?delete_it void delete_it(T *t) { - // First, we check that vtable is not loaded before a type check. // CHECK-VPTR-NOT: load {{.*}} (ptr{{.*}})**, {{.*}} (ptr{{.*}})*** // CHECK-VPTR: br i1 {{.*}} label %{{.*}} - // CHECK-VPTR: call void @__ubsan_handle_dynamic_type_cache_miss_abort + // CHECK-VPTR: call void @__ubsan_handle_type_mismatch_v1_abort // Second, we check that vtable is actually loaded once the type check is done. // CHECK-VPTR: load ptr, ptr {{.*}} + + // First, we check that vtable is not loaded before a type check. 
+ // CHECK-PTRAUTH: ptrtoint ptr {{%.*}} to i64 + // CHECK-PTRAUTH: [[CAST_VTABLE:%.*]] = ptrtoint ptr [[VTABLE:%.*]] to i64 + // CHECK-PTRAUTH: [[STRIPPED_VTABLE:%.*]] = call i64 @llvm.ptrauth.strip(i64 [[CAST_VTABLE]], i32 0) + // CHECK-PTRAUTH: [[STRIPPED_PTR:%.*]] = inttoptr i64 [[STRIPPED_VTABLE]] to ptr + // CHECK-PTRAUTH: [[STRIPPED_INT:%.*]] = ptrtoint ptr [[STRIPPED_PTR]] to i64 + // CHECK-PTRAUTH: {{%.*}} = mul i64 [[STRIPPED_INT]], {{.*}} + // CHECK-PTRAUTH: call void @__ubsan_handle_dynamic_type_cache_miss_abort( + // Second, we check that vtable is actually loaded once the type check is done. + // ptrauth for the virtual function load + // CHECK-PTRAUTH: [[VTABLE2:%.*]] = load ptr, ptr {{.*}} + // CHECK-PTRAUTH: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 %{{.*}}, i64 17113) + // CHECK-PTRAUTH: [[CAST_VTABLE:%.*]] = ptrtoint ptr [[VTABLE2]] to i64 + // CHECK-PTRAUTH: [[AUTHED_INT:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VTABLE]], i32 2, i64 [[BLENDED]]) + // CHECK-PTRAUTH: [[AUTHED_PTR:%.*]] = inttoptr i64 [[AUTHED_INT]] to ptr + // CHECK-PTRAUTH: getelementptr inbounds ptr, ptr + // CHECK-PTRAUTH {{%.*}} = getelementptr inbounds ptr, ptr [[AUTHED_PTR]], i64 1 delete t; } @@ -47,7 +79,19 @@ U* dyncast(T *t) { // First, we check that dynamic_cast is not called before a type check. // CHECK-VPTR-NOT: call ptr @__{{dynamic_cast|RTDynamicCast}} // CHECK-VPTR: br i1 {{.*}} label %{{.*}} + // CHECK-PTRAUTH: [[V0:%.*]] = ptrtoint ptr {{%.*}} to i64 + // CHECK-PTRAUTH: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[V0]], i64 17113) + // CHECK-PTRAUTH: [[CAST_VTABLE:%.*]] = ptrtoint ptr {{%.*}} to i64 + // CHECK-PTRAUTH: [[STRIPPED_VTABLE:%.*]] = call i64 @llvm.ptrauth.strip(i64 [[CAST_VTABLE]], i32 0) + // CHECK-PTRAUTH: [[STRIPPED_PTR:%.*]] = inttoptr i64 [[STRIPPED_VTABLE]] to ptr + // CHECK-PTRAUTH: [[STRIPPED_INT:%.*]] = ptrtoint ptr [[STRIPPED_PTR]] to i64 + // CHECK-PTRAUTH: {{%.*}} = mul i64 [[STRIPPED_INT]], {{.*}} // CHECK-VPTR: call void @__ubsan_handle_dynamic_type_cache_miss_abort + // CHECK-PTRAUTH: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 17113) + // CHECK-PTRAUTH: [[CAST_VTABLE:%.*]] = ptrtoint ptr %vtable1 to i64 + // CHECK-PTRAUTH: [[AUTHED_INT:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VTABLE]], i32 2, i64 [[BLENDED]]) + // CHECK-PTRAUTH: [[AUTHED_PTR:%.*]] = inttoptr i64 [[AUTHED_INT]] to ptr + // CHECK-PTRAUTH: {{%.*}} = load volatile i8, ptr [[AUTHED_PTR]], align 8 // Second, we check that dynamic_cast is actually called once the type check is done. 
// CHECK-VPTR: call ptr @__{{dynamic_cast|RTDynamicCast}}
  return dynamic_cast<U*>(t);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index f30776a8bb85b9..3cf1056cf4f48b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -8,13 +8,13 @@ typedef unsigned int uint;
typedef unsigned long ulong;
// CHECK-LABEL: @test_permlane16(
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
  *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
}
// CHECK-LABEL: @test_permlanex16(
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
  *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index 868b5bed0c9522..f4446574635687 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -37,7 +37,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
}
// CHECK-LABEL: @test_permlane64(
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64(i32 %a)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
void test_permlane64(global uint* out, uint a) {
  *out = __builtin_amdgcn_permlane64(a);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store.cl
new file mode 100644
index 00000000000000..37975d59730c55
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store.cl
@@ -0,0 +1,172 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
+
+typedef char i8;
+typedef short i16;
+typedef int i32;
+typedef int i64 __attribute__((ext_vector_type(2)));
+typedef int i96 __attribute__((ext_vector_type(3)));
+typedef int i128 __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_raw_ptr_buffer_store_b8(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_raw_ptr_buffer_store_b16(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+  __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b32(
+// CHECK-NEXT: entry:
+//
CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b32(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b64(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b96( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b96(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b128( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b128(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b8_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b8_non_const_offset(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b16_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b16_non_const_offset(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b32_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b32_non_const_offset(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b64_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void 
test_amdgcn_raw_ptr_buffer_store_b64_non_const_offset(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b96_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b96_non_const_offset(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b128_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b128_non_const_offset(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b8_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b8_non_const_soffset(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b16_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b16_non_const_soffset(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b32_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b32_non_const_soffset(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b64_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b64_non_const_soffset(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b96_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret void +// +void 
test_amdgcn_raw_ptr_buffer_store_b96_non_const_soffset(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_store_b128_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_store_b128_non_const_soffset(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, /*offset=*/0, soffset, /*aux=*/0); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl index 46af87f5e1d683..5bd8f77a5930c4 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -158,23 +158,85 @@ void test_ds_faddf(local float *out, float src) { } // CHECK-LABEL: @test_ds_fmin -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false) +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} +// CHECK: atomicrmw volatile fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src release, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acq_rel, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} + +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + #if !defined(__SPIRV__) void test_ds_fminf(local float *out, float src) { #else void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) { #endif *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false); + *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, true); + + // Test all orders. + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid + + // Test all syncscopes. 
+ *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid } // CHECK-LABEL: @test_ds_fmax -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false) +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}} +// CHECK: atomicrmw volatile fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src release, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acq_rel, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} + +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + #if !defined(__SPIRV__) void test_ds_fmaxf(local float *out, float src) { #else void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) { #endif *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, true); + + // Test all orders. + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid + + // Test all syncscopes. 
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid } // CHECK-LABEL: @test_s_memtime diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 95daa2cdbc92cb..6a6d5b1dfed3df 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -308,14 +308,14 @@ void test_ds_bpermute(global int* out, int a, int b) } // CHECK-LABEL: @test_readfirstlane -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane(i32 %a) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a) void test_readfirstlane(global int* out, int a) { *out = __builtin_amdgcn_readfirstlane(a); } // CHECK-LABEL: @test_readlane -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b) void test_readlane(global int* out, int a, int b) { *out = __builtin_amdgcn_readlane(a, b); diff --git a/clang/test/Driver/aarch64-fp16.c b/clang/test/Driver/aarch64-fp16.c index 4da1834cf68765..72ccd79850148f 100644 --- a/clang/test/Driver/aarch64-fp16.c +++ b/clang/test/Driver/aarch64-fp16.c @@ -21,7 +21,7 @@ // RUN: %clang --target=aarch64 -march=armv8a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-FP16FML %s -// GENERICV8A-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV8A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.2-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-NO-FP16FML %s @@ -35,11 +35,11 @@ // GENERICV82A-FP16-SAME: {{$}} // RUN: %clang --target=aarch64 -march=armv8.2-a+profile -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-SPE %s -// GENERICV82A-SPE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.2a"{{.*}} "-target-feature" "+spe"{{.*}} "-target-feature" "+neon" +// GENERICV82A-SPE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.2a"{{.*}} "-target-feature" "+neon"{{.*}} "-target-feature" "+spe" // RUN: %clang --target=aarch64 -march=armv8.2a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.2-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-FP16FML %s -// GENERICV82A-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV82A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.2a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-FP16-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.2-a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-FP16-NO-FP16FML %s @@ -51,15 +51,15 @@ // RUN: %clang --target=aarch64 -march=armv8.2a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck 
-check-prefix=GENERICV82A-FP16FML-NO-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.2-a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-FP16FML-NO-FP16 %s -// GENERICV82A-FP16FML-NO-FP16: "-target-feature" "-fullfp16"{{.*}} "-target-feature" "-fp16fml" +// GENERICV82A-FP16FML-NO-FP16: "-target-feature" "-fp16fml"{{.*}} "-target-feature" "-fullfp16" // RUN: %clang --target=aarch64 -march=armv8.2a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-NO-FP16-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.2-a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-NO-FP16-FP16FML %s -// GENERICV82A-NO-FP16-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV82A-NO-FP16-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.2a+fp16+profile -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-FP16-SPE %s // RUN: %clang --target=aarch64 -march=armv8.2-a+fp16+profile -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-FP16-SPE %s -// GENERICV82A-FP16-SPE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}"{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+v8.2a"{{.*}} "-target-feature" "+fullfp16"{{.*}} "-target-feature" "+spe"{{.*}} "-target-feature" "+neon" +// GENERICV82A-FP16-SPE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}"{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+v8.2a"{{.*}} "-target-feature" "+fullfp16"{{.*}} "-target-feature" "+neon"{{.*}} "-target-feature" "+spe" // GENERICV82A-FP16-SPE-NOT: "-target-feature" "{{[+-]}}fp16fml" // GENERICV82A-FP16-SPE-SAME: {{$}} @@ -74,7 +74,7 @@ // RUN: %clang --target=aarch64 -march=armv8.3a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.3-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-FP16FML %s -// GENERICV83A-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV83A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.3a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-FP16-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.3-a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-FP16-NO-FP16FML %s @@ -88,11 +88,11 @@ // RUN: %clang --target=aarch64 -march=armv8.3a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-FP16FML-NO-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.3-a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-FP16FML-NO-FP16 %s -// GENERICV83A-FP16FML-NO-FP16: "-target-feature" "-fullfp16" "-target-feature" "-fp16fml" +// GENERICV83A-FP16FML-NO-FP16: "-target-feature" "-fp16fml" "-target-feature" "-fullfp16" // RUN: %clang --target=aarch64 -march=armv8.3a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-NO-FP16-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.3-a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-NO-FP16-FP16FML %s -// GENERICV83A-NO-FP16-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV83A-NO-FP16-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.4a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.4-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-NO-FP16FML %s @@ -101,27 +101,27 @@ // RUN: %clang 
--target=aarch64 -march=armv8.4a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.4-a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16 %s -// GENERICV84A-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV84A-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.4a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.4-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16FML %s -// GENERICV84A-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV84A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.4a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.4-a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16-NO-FP16FML %s -// GENERICV84A-FP16-NO-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "-fp16fml" +// GENERICV84A-FP16-NO-FP16FML: "-target-feature" "-fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.4a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-NO-FP16FML-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.4-a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-NO-FP16FML-FP16 %s -// GENERICV84A-NO-FP16FML-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV84A-NO-FP16FML-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.4a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16FML-NO-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.4-a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-FP16FML-NO-FP16 %s -// GENERICV84A-FP16FML-NO-FP16: "-target-feature" "-fullfp16" "-target-feature" "-fp16fml" +// GENERICV84A-FP16FML-NO-FP16: "-target-feature" "-fp16fml" "-target-feature" "-fullfp16" // RUN: %clang --target=aarch64 -march=armv8.4a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-NO-FP16-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.4-a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-NO-FP16-FP16FML %s -// GENERICV84A-NO-FP16-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV84A-NO-FP16-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-NO-FP16FML %s @@ -130,27 +130,27 @@ // RUN: %clang --target=aarch64 -march=armv8.5a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.5-a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16 %s -// GENERICV85A-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV85A-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.5a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.5-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16FML %s -// GENERICV85A-FP16FML: "-target-feature" 
"+fullfp16" "-target-feature" "+fp16fml" +// GENERICV85A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.5a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.5-a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16-NO-FP16FML %s -// GENERICV85A-FP16-NO-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "-fp16fml" +// GENERICV85A-FP16-NO-FP16FML: "-target-feature" "-fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.5a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-NO-FP16FML-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.5-a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-NO-FP16FML-FP16 %s -// GENERICV85A-NO-FP16FML-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV85A-NO-FP16FML-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.5a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16FML-NO-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.5-a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16FML-NO-FP16 %s -// GENERICV85A-FP16FML-NO-FP16: "-target-feature" "-fullfp16" "-target-feature" "-fp16fml" +// GENERICV85A-FP16FML-NO-FP16: "-target-feature" "-fp16fml" "-target-feature" "-fullfp16" // RUN: %clang --target=aarch64 -march=armv8.5a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-NO-FP16-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.5-a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-NO-FP16-FP16FML %s -// GENERICV85A-NO-FP16-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV85A-NO-FP16-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-NO-FP16FML %s @@ -159,27 +159,27 @@ // RUN: %clang --target=aarch64 -march=armv8.6a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.6-a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16 %s -// GENERICV86A-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV86A-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.6a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.6-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16FML %s -// GENERICV86A-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV86A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.6a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.6-a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16-NO-FP16FML %s -// GENERICV86A-FP16-NO-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "-fp16fml" +// GENERICV86A-FP16-NO-FP16FML: "-target-feature" "-fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.6a+nofp16fml+fp16 -### -c %s 2>&1 | 
FileCheck -check-prefix=GENERICV86A-NO-FP16FML-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.6-a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-NO-FP16FML-FP16 %s -// GENERICV86A-NO-FP16FML-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV86A-NO-FP16FML-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.6a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16FML-NO-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.6-a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-FP16FML-NO-FP16 %s -// GENERICV86A-FP16FML-NO-FP16: "-target-feature" "-fullfp16" "-target-feature" "-fp16fml" +// GENERICV86A-FP16FML-NO-FP16: "-target-feature" "-fp16fml" "-target-feature" "-fullfp16" // RUN: %clang --target=aarch64 -march=armv8.6a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-NO-FP16-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.6-a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-NO-FP16-FP16FML %s -// GENERICV86A-NO-FP16-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV86A-NO-FP16-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-NO-FP16FML %s @@ -188,27 +188,27 @@ // RUN: %clang --target=aarch64 -march=armv8.7a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.7-a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16 %s -// GENERICV87A-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV87A-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.7a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.7-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16FML %s -// GENERICV87A-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV87A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.7a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.7-a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16-NO-FP16FML %s -// GENERICV87A-FP16-NO-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "-fp16fml" +// GENERICV87A-FP16-NO-FP16FML: "-target-feature" "-fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.7a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-NO-FP16FML-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.7-a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-NO-FP16FML-FP16 %s -// GENERICV87A-NO-FP16FML-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV87A-NO-FP16FML-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.7a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16FML-NO-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.7-a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-FP16FML-NO-FP16 %s -// 
GENERICV87A-FP16FML-NO-FP16: "-target-feature" "-fullfp16" "-target-feature" "-fp16fml" +// GENERICV87A-FP16FML-NO-FP16: "-target-feature" "-fp16fml" "-target-feature" "-fullfp16" // RUN: %clang --target=aarch64 -march=armv8.7a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-NO-FP16-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.7-a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-NO-FP16-FP16FML %s -// GENERICV87A-NO-FP16-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV87A-NO-FP16-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.8-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-NO-FP16FML %s @@ -217,24 +217,24 @@ // RUN: %clang --target=aarch64 -march=armv8.8a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.8-a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16 %s -// GENERICV88A-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV88A-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.8a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.8-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16FML %s -// GENERICV88A-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV88A-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.8a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16-NO-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.8-a+fp16+nofp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16-NO-FP16FML %s -// GENERICV88A-FP16-NO-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "-fp16fml" +// GENERICV88A-FP16-NO-FP16FML: "-target-feature" "-fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.8a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-NO-FP16FML-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.8-a+nofp16fml+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-NO-FP16FML-FP16 %s -// GENERICV88A-NO-FP16FML-FP16: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV88A-NO-FP16FML-FP16: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" // RUN: %clang --target=aarch64 -march=armv8.8a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16FML-NO-FP16 %s // RUN: %clang --target=aarch64 -march=armv8.8-a+fp16fml+nofp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-FP16FML-NO-FP16 %s -// GENERICV88A-FP16FML-NO-FP16: "-target-feature" "-fullfp16" "-target-feature" "-fp16fml" +// GENERICV88A-FP16FML-NO-FP16: "-target-feature" "-fp16fml" "-target-feature" "-fullfp16" // RUN: %clang --target=aarch64 -march=armv8.8a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-NO-FP16-FP16FML %s // RUN: %clang --target=aarch64 -march=armv8.8-a+nofp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-NO-FP16-FP16FML %s -// GENERICV88A-NO-FP16-FP16FML: "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" +// GENERICV88A-NO-FP16-FP16FML: "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" diff --git 
a/clang/test/Driver/aarch64-mac-cpus.c b/clang/test/Driver/aarch64-mac-cpus.c index 488298cfd2d245..8d23ad8c956fd7 100644 --- a/clang/test/Driver/aarch64-mac-cpus.c +++ b/clang/test/Driver/aarch64-mac-cpus.c @@ -21,4 +21,4 @@ // EXPLICIT-A11: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-a11" // EXPLICIT-A7: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-a7" // EXPLICIT-A14: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-a14" -// EXPLICIT-M1: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-m1" +// EXPLICIT-M1: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-a14" diff --git a/clang/test/Driver/aarch64-sve2.c b/clang/test/Driver/aarch64-sve2.c index c801f44d3cb842..389c3a52bde44b 100644 --- a/clang/test/Driver/aarch64-sve2.c +++ b/clang/test/Driver/aarch64-sve2.c @@ -5,4 +5,4 @@ // RUN: %clang --target=aarch64 -mlittle-endian -march=armv9-a+nosve2 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-NOSVE2 %s // RUN: %clang --target=aarch64_be -mlittle-endian -march=armv9a+nosve2 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-NOSVE2 %s // RUN: %clang --target=aarch64_be -mlittle-endian -march=armv9-a+nosve2 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-NOSVE2 %s -// GENERICV9A-NOSVE2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9a"{{.*}} "-target-feature" "+neon" "-target-feature" "+sve" "-target-feature" "-sve2" +// GENERICV9A-NOSVE2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9a"{{.*}} "-target-feature" "+neon"{{.*}} "-target-feature" "+sve" "-target-feature" "-sve2" diff --git a/clang/test/Driver/aarch64-v81a.c b/clang/test/Driver/aarch64-v81a.c index e84652ec7f11e1..f3873b3ab1864e 100644 --- a/clang/test/Driver/aarch64-v81a.c +++ b/clang/test/Driver/aarch64-v81a.c @@ -19,3 +19,17 @@ // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV81A %s // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1-a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV81A %s // ARM64-GENERICV81A: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "generic"{{.*}} "-target-feature" "+v8.1a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.1-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/aarch64-v82a.c b/clang/test/Driver/aarch64-v82a.c index 9dd355934c1059..318c270cad3966 100644 --- a/clang/test/Driver/aarch64-v82a.c +++ b/clang/test/Driver/aarch64-v82a.c @@ -13,3 +13,21 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.2-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV82A-BE %s // GENERICV82A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.2a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.2-a --print-enabled-extensions | 
FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/aarch64-v83a.c b/clang/test/Driver/aarch64-v83a.c index b0ff9fb3abc24c..f35c500ee0a9be 100644 --- a/clang/test/Driver/aarch64-v83a.c +++ b/clang/test/Driver/aarch64-v83a.c @@ -13,3 +13,26 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.3a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.3-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV83A-BE %s // GENERICV83A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.3a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.3-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/aarch64-v84a.c b/clang/test/Driver/aarch64-v84a.c index 030990bfe5131c..d72c79fc14cec5 100644 --- a/clang/test/Driver/aarch64-v84a.c +++ b/clang/test/Driver/aarch64-v84a.c @@ -13,3 +13,37 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.4a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.4-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV84A-BE %s // GENERICV84A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.4a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.4-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_SEL2 +// 
ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/aarch64-v85a.c b/clang/test/Driver/aarch64-v85a.c index 3e1e921dcc0133..06c0989bc8e9b2 100644 --- a/clang/test/Driver/aarch64-v85a.c +++ b/clang/test/Driver/aarch64-v85a.c @@ -13,3 +13,45 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-BE %s // GENERICV85A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.5a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.5-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/aarch64-v86a.c b/clang/test/Driver/aarch64-v86a.c index ba2b57979b5187..04d372e762a37d 100644 --- a/clang/test/Driver/aarch64-v86a.c +++ b/clang/test/Driver/aarch64-v86a.c @@ -13,3 +13,50 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s // GENERICV86A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.6a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.6-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: 
FEAT_FP +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/aarch64-v87a.c b/clang/test/Driver/aarch64-v87a.c index ee4b68882739a8..b385e7cfb2ad61 100644 --- a/clang/test/Driver/aarch64-v87a.c +++ b/clang/test/Driver/aarch64-v87a.c @@ -13,3 +13,53 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV87A-BE %s // GENERICV87A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.7a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.7-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_HCX +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_WFxT +// ARCH-EXTENSION: FEAT_XS diff --git a/clang/test/Driver/aarch64-v88a.c b/clang/test/Driver/aarch64-v88a.c index b680c1f567134d..438796b91fff61 100644 --- a/clang/test/Driver/aarch64-v88a.c +++ b/clang/test/Driver/aarch64-v88a.c @@ -13,3 +13,56 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV88A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.8-a -### -c %s 2>&1 | 
FileCheck -check-prefix=GENERICV88A-BE %s // GENERICV88A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.8a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.8-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_HBC +// ARCH-EXTENSION: FEAT_HCX +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MOPS +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NMI, FEAT_GICv3_NMI +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_WFxT +// ARCH-EXTENSION: FEAT_XS diff --git a/clang/test/Driver/aarch64-v89a.c b/clang/test/Driver/aarch64-v89a.c index 903b793d046ba6..42bcb127cd3bd2 100644 --- a/clang/test/Driver/aarch64-v89a.c +++ b/clang/test/Driver/aarch64-v89a.c @@ -12,3 +12,62 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.9a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV89A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv8.9-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV89A-BE %s // GENERICV89A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.9a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv8.9-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CHK +// ARCH-EXTENSION: FEAT_CLRBHB +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSSC +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_HBC +// ARCH-EXTENSION: 
FEAT_HCX +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MOPS +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NMI, FEAT_GICv3_NMI +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_PRFMSLC +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RASv2 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SPECRES2 +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_WFxT +// ARCH-EXTENSION: FEAT_XS diff --git a/clang/test/Driver/aarch64-v8a.c b/clang/test/Driver/aarch64-v8a.c new file mode 100644 index 00000000000000..a3b01560b22a6f --- /dev/null +++ b/clang/test/Driver/aarch64-v8a.c @@ -0,0 +1,29 @@ +// RUN: %clang --target=aarch64_be -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-BE %s +// RUN: %clang --target=aarch64_be -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-BE %s +// RUN: %clang --target=aarch64 -mbig-endian -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-BE %s +// RUN: %clang --target=aarch64 -mbig-endian -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-BE %s +// RUN: %clang --target=aarch64_be -mbig-endian -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-BE %s +// RUN: %clang --target=aarch64_be -mbig-endian -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-BE %s +// GENERICV8A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic"{{.*}} "-target-feature" "+v8a"{{.*}} "-target-feature" "+neon" + +// RUN: %clang --target=aarch64 -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A %s +// RUN: %clang --target=aarch64 -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A %s +// RUN: %clang --target=aarch64 -mlittle-endian -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A %s +// RUN: %clang --target=aarch64 -mlittle-endian -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A %s +// RUN: %clang --target=aarch64_be -mlittle-endian -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A %s +// RUN: %clang --target=aarch64_be -mlittle-endian -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A %s +// GENERICV8A: "-cc1"{{.*}} "-triple" "aarch64{{(--)?}}"{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+v8a"{{.*}} "-target-feature" "+neon"{{.*}} + +// RUN: %clang --target=arm64 -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV8A %s +// RUN: %clang --target=arm64 -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV8A %s +// RUN: %clang --target=arm64 -mlittle-endian -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV8A %s +// RUN: %clang --target=arm64 -mlittle-endian -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV8A %s +// ARM64-GENERICV8A: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "generic"{{.*}} "-target-feature" "+v8a"{{.*}} "-target-feature" "+neon" + +// ===== Architecture extensions ===== + +// RUN: %if 
aarch64-registered-target %{ %clang -target aarch64 -march=armv8-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_TRBE diff --git a/clang/test/Driver/aarch64-v91a.c b/clang/test/Driver/aarch64-v91a.c index 80853a59d01537..c5667f8fb3bcdc 100644 --- a/clang/test/Driver/aarch64-v91a.c +++ b/clang/test/Driver/aarch64-v91a.c @@ -13,3 +13,55 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.1a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV91A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.1-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV91A-BE %s // GENERICV91A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.1a"{{.*}} "-target-feature" "+sve" "-target-feature" "+sve2" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv9.1-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FP16 +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MEC +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_RME +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_SVE +// ARCH-EXTENSION: FEAT_SVE2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/aarch64-v92a.c b/clang/test/Driver/aarch64-v92a.c index ee644cc6f3c620..6088a69312dc4c 100644 --- a/clang/test/Driver/aarch64-v92a.c +++ b/clang/test/Driver/aarch64-v92a.c @@ -13,3 +13,58 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.2a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV92A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.2-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV92A-BE %s // GENERICV92A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.2a"{{.*}} "-target-feature" "+sve" "-target-feature" "+sve2" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv9.2-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 
+// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FP16 +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_HCX +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MEC +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_RME +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_SVE +// ARCH-EXTENSION: FEAT_SVE2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_WFxT +// ARCH-EXTENSION: FEAT_XS diff --git a/clang/test/Driver/aarch64-v93a.c b/clang/test/Driver/aarch64-v93a.c index 817559e28ccf47..5db3034078ea80 100644 --- a/clang/test/Driver/aarch64-v93a.c +++ b/clang/test/Driver/aarch64-v93a.c @@ -13,3 +13,61 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.3a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV93A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.3-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV93A-BE %s // GENERICV93A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.3a" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv9.3-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FP16 +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_HBC +// ARCH-EXTENSION: FEAT_HCX +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MEC +// ARCH-EXTENSION: FEAT_MOPS +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NMI, FEAT_GICv3_NMI +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// 
ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_RME +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_SVE +// ARCH-EXTENSION: FEAT_SVE2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_WFxT +// ARCH-EXTENSION: FEAT_XS diff --git a/clang/test/Driver/aarch64-v94a.c b/clang/test/Driver/aarch64-v94a.c index 9998cc8a4a2160..d9f991fc95d3d5 100644 --- a/clang/test/Driver/aarch64-v94a.c +++ b/clang/test/Driver/aarch64-v94a.c @@ -13,3 +13,67 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.4a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV94A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=armv9.4-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV94A-BE %s // GENERICV94A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.4a" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv9.4-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CHK +// ARCH-EXTENSION: FEAT_CLRBHB +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSSC +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FP16 +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_HBC +// ARCH-EXTENSION: FEAT_HCX +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MEC +// ARCH-EXTENSION: FEAT_MOPS +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NMI, FEAT_GICv3_NMI +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_PRFMSLC +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RASv2 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_RME +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SPECRES2 +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_SVE +// ARCH-EXTENSION: FEAT_SVE2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_WFxT +// ARCH-EXTENSION: FEAT_XS diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c index 62878f2127626e..e3e97d2bef13f9 100644 --- a/clang/test/Driver/aarch64-v95a.c +++ b/clang/test/Driver/aarch64-v95a.c @@ -25,3 +25,70 @@ // RUN: %clang -target aarch64 -march=armv9.5a+tlbiw -### -c %s 2>&1 | FileCheck -check-prefix=V95A-TLBIW %s // RUN: %clang -target 
aarch64 -march=armv9.5-a+tlbiw -### -c %s 2>&1 | FileCheck -check-prefix=V95A-TLBIW %s // V95A-TLBIW: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+tlbiw" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv9.5-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AMUv1p1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BF16 +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CHK +// ARCH-EXTENSION: FEAT_CLRBHB +// ARCH-EXTENSION: FEAT_CPA +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSSC +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ECV +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FAMINMAX +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FGT +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FP16 +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_HBC +// ARCH-EXTENSION: FEAT_HCX +// ARCH-EXTENSION: FEAT_I8MM +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_LUT +// ARCH-EXTENSION: FEAT_MEC +// ARCH-EXTENSION: FEAT_MOPS +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NMI, FEAT_GICv3_NMI +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_PRFMSLC +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RASv2 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_RME +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SPECRES2 +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_SVE +// ARCH-EXTENSION: FEAT_SVE2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_WFxT +// ARCH-EXTENSION: FEAT_XS diff --git a/clang/test/Driver/aarch64-v9a.c b/clang/test/Driver/aarch64-v9a.c new file mode 100644 index 00000000000000..f85e1c409010d5 --- /dev/null +++ b/clang/test/Driver/aarch64-v9a.c @@ -0,0 +1,62 @@ +// RUN: %clang --target=aarch64 -march=armv9a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A %s +// RUN: %clang --target=aarch64 -march=armv9-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A %s +// RUN: %clang --target=aarch64 -mlittle-endian -march=armv9a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A %s +// RUN: %clang --target=aarch64 -mlittle-endian -march=armv9-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A %s +// RUN: %clang --target=aarch64_be -mlittle-endian -march=armv9a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A %s +// RUN: %clang --target=aarch64_be -mlittle-endian -march=armv9-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A %s +// GENERICV9A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9a"{{.*}} "-target-feature" "+sve" "-target-feature" "+sve2" + +// RUN: %clang --target=aarch64_be -march=armv9a -### -c %s 2>&1 | 
FileCheck -check-prefix=GENERICV9A-BE %s +// RUN: %clang --target=aarch64_be -march=armv9-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-BE %s +// RUN: %clang --target=aarch64 -mbig-endian -march=armv9a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-BE %s +// RUN: %clang --target=aarch64 -mbig-endian -march=armv9-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-BE %s +// RUN: %clang --target=aarch64_be -mbig-endian -march=armv9a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-BE %s +// RUN: %clang --target=aarch64_be -mbig-endian -march=armv9-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV9A-BE %s +// GENERICV9A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9a"{{.*}} "-target-feature" "+sve" "-target-feature" "+sve2" + +// ===== Architecture extensions ===== + +// RUN: %if aarch64-registered-target %{ %clang -target aarch64 -march=armv9-a --print-enabled-extensions | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s %} +// ARCH-EXTENSION: FEAT_AMUv1 +// ARCH-EXTENSION: FEAT_AdvSIMD +// ARCH-EXTENSION: FEAT_BTI +// ARCH-EXTENSION: FEAT_CCIDX +// ARCH-EXTENSION: FEAT_CRC32 +// ARCH-EXTENSION: FEAT_CSV2_2 +// ARCH-EXTENSION: FEAT_DIT +// ARCH-EXTENSION: FEAT_DPB +// ARCH-EXTENSION: FEAT_DPB2 +// ARCH-EXTENSION: FEAT_DotProd +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_FCMA +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_FP16 +// ARCH-EXTENSION: FEAT_FRINTTS +// ARCH-EXTENSION: FEAT_FlagM +// ARCH-EXTENSION: FEAT_FlagM2 +// ARCH-EXTENSION: FEAT_JSCVT +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_LRCPC +// ARCH-EXTENSION: FEAT_LRCPC2 +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_LSE2 +// ARCH-EXTENSION: FEAT_MEC +// ARCH-EXTENSION: FEAT_MPAM +// ARCH-EXTENSION: FEAT_NV, FEAT_NV2 +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_PAN2 +// ARCH-EXTENSION: FEAT_PAuth +// ARCH-EXTENSION: FEAT_RAS, FEAT_RASv1p1 +// ARCH-EXTENSION: FEAT_RDM +// ARCH-EXTENSION: FEAT_RME +// ARCH-EXTENSION: FEAT_SB +// ARCH-EXTENSION: FEAT_SEL2 +// ARCH-EXTENSION: FEAT_SPECRES +// ARCH-EXTENSION: FEAT_SSBS, FEAT_SSBS2 +// ARCH-EXTENSION: FEAT_SVE +// ARCH-EXTENSION: FEAT_SVE2 +// ARCH-EXTENSION: FEAT_TLBIOS, FEAT_TLBIRANGE +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_TRF +// ARCH-EXTENSION: FEAT_UAO +// ARCH-EXTENSION: FEAT_VHE diff --git a/clang/test/Driver/cuda-arch-translation.cu b/clang/test/Driver/cuda-arch-translation.cu index ff97f2dbda6c57..e96191cc9d4183 100644 --- a/clang/test/Driver/cuda-arch-translation.cu +++ b/clang/test/Driver/cuda-arch-translation.cu @@ -59,6 +59,8 @@ // RUN: | FileCheck -check-prefixes=HIP,GFX900 %s // RUN: %clang -x hip -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=gfx902 -nogpuinc -nogpulib %s 2>&1 \ // RUN: | FileCheck -check-prefixes=HIP,GFX902 %s +// RUN: %clang -x hip -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=amdgcnspirv -nogpuinc -nogpulib %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HIP,SPIRV %s // CUDA: ptxas // CUDA-SAME: -m64 @@ -95,3 +97,4 @@ // GFX810:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx810 // GFX900:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx900 // GFX902:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx902 +// SPIRV:-targets=host-x86_64-unknown-linux,hip-spirv64-amd-amdhsa--amdgcnspirv diff --git a/clang/test/Driver/print-supported-extensions.c b/clang/test/Driver/print-supported-extensions.c index 17894fc0f7ee0b..b9b16352f8295b 100644 --- 
a/clang/test/Driver/print-supported-extensions.c +++ b/clang/test/Driver/print-supported-extensions.c @@ -4,8 +4,8 @@ // RUN: %if aarch64-registered-target %{ %clang --target=aarch64-linux-gnu \ // RUN: --print-supported-extensions 2>&1 | FileCheck %s --check-prefix AARCH64 %} // AARCH64: All available -march extensions for AArch64 -// AARCH64: Name Description -// AARCH64: aes Enable AES support (FEAT_AES, FEAT_PMULL) +// AARCH64: Name Architecture Feature(s) Description +// AARCH64: aes FEAT_AES, FEAT_PMULL Enable AES support // RUN: %if riscv-registered-target %{ %clang --target=riscv64-linux-gnu \ // RUN: --print-supported-extensions 2>&1 | FileCheck %s --check-prefix RISCV %} diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 3022ed1250d590..6773571556bd42 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -423,8 +423,8 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -mno-apxf -mapxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=APXF %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapxf -mno-apxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-APXF %s // -// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" -// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" +// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+cf" +// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" "-target-feature" "-cf" // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=push2pop2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s @@ -433,6 +433,7 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=ccmp %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CCMP %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=nf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NF %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=cf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CF %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=zu %s -### -o %t.o 2>&1 | FileCheck -check-prefix=ZU %s // EGPR: "-target-feature" "+egpr" // PUSH2POP2: "-target-feature" "+push2pop2" // PPX: "-target-feature" "+ppx" @@ -440,6 +441,7 @@ // CCMP: "-target-feature" "+ccmp" // NF: "-target-feature" "+nf" // CF: "-target-feature" "+cf" +// ZU: "-target-feature" "+zu" // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr,ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr -mapx-features=ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s diff --git a/clang/test/Frontend/embed-bitcode.ll b/clang/test/Frontend/embed-bitcode.ll index 9b8632d04dd985..0959af48ad24af 100644 --- a/clang/test/Frontend/embed-bitcode.ll +++ b/clang/test/Frontend/embed-bitcode.ll @@ -10,6 +10,9 @@ ; RUN: %clang_cc1 -triple aarch64 -emit-llvm \ ; RUN: -fembed-bitcode=all -x ir %s -o - \ ; RUN: | FileCheck %s 
-check-prefix=CHECK-ELF +; RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm \ +; RUN: -fembed-bitcode=all -x ir %s -o - \ +; RUN: | FileCheck %s -check-prefix=CHECK-ELF ; check .bc input ; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm-bc \ diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 99732694f72a5f..28df04c5e33ef4 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -201,6 +201,7 @@ // CHECK-NEXT: Uninitialized (SubjectMatchRule_variable_is_local) // CHECK-NEXT: UnsafeBufferUsage (SubjectMatchRule_function) // CHECK-NEXT: UseHandle (SubjectMatchRule_variable_is_parameter) +// CHECK-NEXT: VTablePointerAuthentication (SubjectMatchRule_record) // CHECK-NEXT: VecReturn (SubjectMatchRule_record) // CHECK-NEXT: VecTypeHint (SubjectMatchRule_function) // CHECK-NEXT: WarnUnused (SubjectMatchRule_record) diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 1a9063ee5a2574..a5f9ffa21220a9 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -5,11 +5,11 @@ // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' -// AARCH64-NEXT: note: valid target CPU values are: generic, cortex-a35, cortex-a34, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-a725, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, cortex-x925, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx, thunderxt88, thunderxt81, thunderxt83, thunderx2t99, thunderx3t110, tsv110, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-s4, apple-s5, apple-a13, apple-a14, apple-m1, apple-a15, apple-m2, apple-a16, apple-m3, apple-a17, apple-m4, a64fx, carmel, ampere1, ampere1a, ampere1b, oryon-1, cobalt-100, grace{{$}} +// AARCH64-NEXT: note: valid target CPU values are: a64fx, ampere1, ampere1a, ampere1b, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-a7, apple-a8, apple-a9, apple-m1, apple-m2, apple-m3, apple-m4, apple-s4, apple-s5, carmel, cobalt-100, cortex-a34, cortex-a35, cortex-a510, cortex-a520, cortex-a520ae, cortex-a53, cortex-a55, cortex-a57, cortex-a65, cortex-a65ae, cortex-a710, cortex-a715, cortex-a72, cortex-a720, cortex-a720ae, cortex-a725, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, cortex-x925, cyclone, exynos-m3, exynos-m4, exynos-m5, falkor, generic, grace, kryo, neoverse-512tvb, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, oryon-1, saphira, thunderx, thunderx2t99, thunderx3t110, thunderxt81, thunderxt83, thunderxt88, tsv110{{$}} // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix 
TUNE_AARCH64 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu' -// TUNE_AARCH64-NEXT: note: valid target CPU values are: generic, cortex-a35, cortex-a34, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-a725, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, cortex-x925, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx, thunderxt88, thunderxt81, thunderxt83, thunderx2t99, thunderx3t110, tsv110, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-s4, apple-s5, apple-a13, apple-a14, apple-m1, apple-a15, apple-m2, apple-a16, apple-m3, apple-a17, apple-m4, a64fx, carmel, ampere1, ampere1a, ampere1b, oryon-1, cobalt-100, grace{{$}} +// TUNE_AARCH64-NEXT: note: valid target CPU values are: a64fx, ampere1, ampere1a, ampere1b, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-a7, apple-a8, apple-a9, apple-m1, apple-m2, apple-m3, apple-m4, apple-s4, apple-s5, carmel, cobalt-100, cortex-a34, cortex-a35, cortex-a510, cortex-a520, cortex-a520ae, cortex-a53, cortex-a55, cortex-a57, cortex-a65, cortex-a65ae, cortex-a710, cortex-a715, cortex-a72, cortex-a720, cortex-a720ae, cortex-a725, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, cortex-x925, cyclone, exynos-m3, exynos-m4, exynos-m5, falkor, generic, grace, kryo, neoverse-512tvb, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, oryon-1, saphira, thunderx, thunderx2t99, thunderx3t110, thunderxt81, thunderxt83, thunderxt88, tsv110{{$}} // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86 // X86: error: unknown target CPU 'not-a-cpu' @@ -29,7 +29,7 @@ // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX // NVPTX: error: unknown target CPU 'not-a-cpu' -// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201{{$}} +// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, 
gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}} // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600 // R600: error: unknown target CPU 'not-a-cpu' diff --git a/clang/test/Misc/warning-flags.c b/clang/test/Misc/warning-flags.c index dd73331913c6f6..ae14720959a0ef 100644 --- a/clang/test/Misc/warning-flags.c +++ b/clang/test/Misc/warning-flags.c @@ -18,10 +18,9 @@ This test serves two purposes: The list of warnings below should NEVER grow. It should gradually shrink to 0. -CHECK: Warnings without flags (66): +CHECK: Warnings without flags (65): CHECK-NEXT: ext_expected_semi_decl_list -CHECK-NEXT: ext_explicit_specialization_storage_class CHECK-NEXT: ext_missing_whitespace_after_macro_name CHECK-NEXT: ext_new_paren_array_nonconst CHECK-NEXT: ext_plain_complex diff --git a/clang/test/Modules/forward-friend.cppm b/clang/test/Modules/forward-friend.cppm new file mode 100644 index 00000000000000..dfadf4fcc1dae9 --- /dev/null +++ b/clang/test/Modules/forward-friend.cppm @@ -0,0 +1,22 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t +// +// RUN: %clang_cc1 -std=c++20 %t/m.cppm -fsyntax-only -verify + +//--- foo.h + +template +static void foo(U...) noexcept; + +class A { + template + friend void foo(U...) noexcept; +}; + +//--- m.cppm +// expected-no-diagnostics +module; +#include "foo.h" +export module m; +export using ::A; diff --git a/clang/test/Modules/no-eager-load.cppm b/clang/test/Modules/no-eager-load.cppm index 8a2c7656bca2b4..c9eddaaed15552 100644 --- a/clang/test/Modules/no-eager-load.cppm +++ b/clang/test/Modules/no-eager-load.cppm @@ -44,6 +44,9 @@ void use() { // expected-note@* {{but in 'a' found a different body}} } +// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' can't be attached to other modules}} +// expected-note@b.cppm:* {{}} + //--- h.cppm export module h; export import a; @@ -55,3 +58,6 @@ void use() { foo(); // expected-error@* {{'foo' has different definitions in different modules;}} // expected-note@* {{but in 'a' found a different body}} } + +// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' can't be attached to other modules}} +// expected-note@b.cppm:* {{}} diff --git a/clang/test/Modules/no-external-identifier-id.cppm b/clang/test/Modules/no-external-identifier-id.cppm new file mode 100644 index 00000000000000..25825ef67ad91f --- /dev/null +++ b/clang/test/Modules/no-external-identifier-id.cppm @@ -0,0 +1,37 @@ +// Testing that we won't record the identifier ID from external modules. 
+// +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-module-interface -o %t/b.pcm \ +// RUN: -fmodule-file=a=%t/a.pcm +// RUN: llvm-bcanalyzer --dump --disable-histogram %t/b.pcm | FileCheck %t/b.cppm +// +// RUN: %clang_cc1 -std=c++20 %t/a.v1.cppm -emit-module-interface -o %t/a.v1.pcm +// RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-module-interface -o %t/b.v1.pcm \ +// RUN: -fmodule-file=a=%t/a.v1.pcm +// RUN: diff %t/b.pcm %t/b.v1.pcm &> /dev/null + +//--- a.cppm +export module a; +export inline int a() { + int foo = 43; + return foo; +} + +//--- b.cppm +export module b; +import a; +export inline int b() { + int foo = 43; + return foo; +} + +// CHECK: /dev/null + +//--- a.cppm +export module a; +export int a(); + +//--- b.cppm +export module b; +import a; +export int b(); + +// CHECK: diff --git a/clang/test/Modules/no-transitive-decl-change-2.cppm b/clang/test/Modules/no-transitive-decl-change-2.cppm new file mode 100644 index 00000000000000..fe29d9169201d6 --- /dev/null +++ b/clang/test/Modules/no-transitive-decl-change-2.cppm @@ -0,0 +1,28 @@ +// Test that adding a new unused decl within reduced BMI may not produce a transitive change. +// +// RUN: rm -rf %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/A.cppm -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/B.cppm -o %t/B.pcm \ +// RUN: -fmodule-file=A=%t/A.pcm +// +// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/A.v1.cppm -o %t/A.v1.pcm +// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/B.cppm -o %t/B.v1.pcm \ +// RUN: -fmodule-file=A=%t/A.v1.pcm +// +// RUN: diff %t/B.pcm %t/B.v1.pcm &> /dev/null + +//--- A.cppm +export module A; +export int a() { return 44; } + +//--- A.v1.cppm +export module A; +int a_impl() { return 48; } +export int a() { return a_impl(); } + +//--- B.cppm +export module B; +import A; +export int b() { return a(); } diff --git a/clang/test/Modules/same-decl-in-different-modules.cppm b/clang/test/Modules/same-decl-in-different-modules.cppm new file mode 100644 index 00000000000000..2e8e90f7cd8e95 --- /dev/null +++ b/clang/test/Modules/same-decl-in-different-modules.cppm @@ -0,0 +1,42 @@ +// RUN: rm -rf %t +// RUN: mkdir %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/mod1.cppm -emit-module-interface -o %t/mod1.pcm +// RUN: %clang_cc1 -std=c++20 %t/mod2.cppm -emit-module-interface -o %t/mod2.pcm +// RUN: %clang_cc1 -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -verify + +//--- mod1.cppm +export module mod1; +export int v; +export void func(); +export class A {}; +export template +struct S {}; + +//--- mod2.cppm +export module mod2; +export int v; +export void func(); +export class A; +export template +struct S {}; + +//--- test.cc +import mod1; +import mod2; +void test() { + int value = v; + func(); + A a; + S s; +} + +// expected-error@mod1.cppm:* {{declaration 'v' attached to named module 'mod1' can't be attached to other modules}} +// expected-note@mod2.cppm:* {{}} +// expected-error@mod1.cppm:* {{declaration 'func' attached to named module 'mod1' can't be attached to other modules}} +// expected-note@mod2.cppm:* {{}} +// expected-error@mod1.cppm:* {{declaration 'A' attached to named module 'mod1' can't be attached to other modules}} +// expected-note@mod2.cppm:* {{}} +// expected-error@mod1.cppm:* 1+{{declaration 'S' attached to named 
module 'mod1' can't be attached to other modules}} +// expected-note@mod2.cppm:* 1+{{}} diff --git a/clang/test/OpenMP/Inputs/nesting_of_regions.cpp b/clang/test/OpenMP/Inputs/nesting_of_regions.cpp index e671f9b0cf4123..969ddfcce4cb0e 100644 --- a/clang/test/OpenMP/Inputs/nesting_of_regions.cpp +++ b/clang/test/OpenMP/Inputs/nesting_of_regions.cpp @@ -4880,6 +4880,12 @@ void foo() { #pragma omp teams // expected-note {{nested teams construct here}} ++a; } +#pragma omp target // expected-error {{target construct with nested teams region contains statements outside of the teams construct}} + { +#pragma omp teams // expected-note {{nested teams construct here}} + ++a; + ++a; // expected-note {{statement outside teams construct here}} + } #pragma omp target // expected-error {{target construct with nested teams region contains statements outside of the teams construct}} { while (0) // expected-note {{statement outside teams construct here}} @@ -14133,6 +14139,12 @@ void foo() { #pragma omp teams // expected-note {{nested teams construct here}} ++a; } +#pragma omp target // expected-error {{target construct with nested teams region contains statements outside of the teams construct}} + { +#pragma omp teams // expected-note {{nested teams construct here}} + ++a; + ++a; // expected-note {{statement outside teams construct here}} + } #pragma omp target { #pragma omp taskloop diff --git a/clang/test/OpenMP/declare_variant_clauses_messages.cpp b/clang/test/OpenMP/declare_variant_clauses_messages.cpp index 2a9e5385c9ca64..284e49bbd21b4c 100644 --- a/clang/test/OpenMP/declare_variant_clauses_messages.cpp +++ b/clang/test/OpenMP/declare_variant_clauses_messages.cpp @@ -186,6 +186,16 @@ void vararg_bar2(const char *fmt) { return; } // expected-error@+1 {{variant in '#pragma omp declare variant' with type 'void (float *, float *, int *, omp_interop_t)' (aka 'void (float *, float *, int *, void *)') is incompatible with type 'void (float *, float *, int *)'}} #pragma omp declare variant(foo_v4) match(construct={dispatch}) +// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}} +#pragma omp declare variant(foo_v1) \ + match(construct={dispatch}, device={arch(arm)}) \ + adjust_args(badaaop:AAA,BBB) + +// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}} +#pragma omp declare variant(foo_v1) \ + match(construct={dispatch}, device={arch(arm)}) \ + adjust_args(badaaop AAA,BBB) + #endif // _OPENMP >= 202011 #if _OPENMP < 202011 // OpenMP 5.0 or lower // expected-error@+2 {{expected 'match' clause on 'omp declare variant' directive}} diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp index c077b58f614fa1..0753573c73bce8 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -223,7 +223,6 @@ int bar(int n){ // CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK-64-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK-64-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -239,6 +238,7 @@ int bar(int n){ // CHECK-64-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 
// CHECK-64-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK-64: body: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]]) // CHECK-64-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-64-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -253,11 +253,12 @@ int bar(int n){ // CHECK-64: else: // CHECK-64-NEXT: br label [[IFCONT]] // CHECK-64: ifcont: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-64: then2: +// CHECK-64: then3: // CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK-64-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 @@ -265,9 +266,9 @@ int bar(int n){ // CHECK-64-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // CHECK-64-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK-64-NEXT: br label [[IFCONT4:%.*]] -// CHECK-64: else3: +// CHECK-64: else4: // CHECK-64-NEXT: br label [[IFCONT4]] -// CHECK-64: ifcont4: +// CHECK-64: ifcont5: // CHECK-64-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // CHECK-64-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK-64-NEXT: br label [[PRECOND]] @@ -445,7 +446,6 @@ int bar(int n){ // CHECK-64-NEXT: entry: // CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK-64-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -454,6 +454,7 @@ int bar(int n){ // CHECK-64-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK-64-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK-64-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-64-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -467,47 +468,50 @@ int bar(int n){ // CHECK-64: else: // CHECK-64-NEXT: br label [[IFCONT]] // CHECK-64: ifcont: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-64: then2: +// CHECK-64: then3: // CHECK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // 
CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK-64-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 // CHECK-64-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1 // CHECK-64-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 // CHECK-64-NEXT: br label [[IFCONT4:%.*]] -// CHECK-64: else3: +// CHECK-64: else4: // CHECK-64-NEXT: br label [[IFCONT4]] -// CHECK-64: ifcont4: +// CHECK-64: ifcont5: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-64-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK-64: then6: +// CHECK-64: then8: // CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK-64-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 // CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK-64-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK-64-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4 // CHECK-64-NEXT: br label [[IFCONT8:%.*]] -// CHECK-64: else7: +// CHECK-64: else9: // CHECK-64-NEXT: br label [[IFCONT8]] -// CHECK-64: ifcont8: +// CHECK-64: ifcont10: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK-64: then10: +// CHECK-64: then13: // CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK-64-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8 // CHECK-64-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4 // CHECK-64-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4 // CHECK-64-NEXT: br label [[IFCONT12:%.*]] -// CHECK-64: else11: +// CHECK-64: else14: // CHECK-64-NEXT: br label [[IFCONT12]] -// CHECK-64: ifcont12: +// CHECK-64: ifcont15: // CHECK-64-NEXT: ret void // // @@ -698,7 +702,6 @@ int bar(int n){ // CHECK-64-NEXT: entry: // CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK-64-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -707,6 +710,7 @@ int bar(int n){ // CHECK-64-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK-64-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK-64-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 
0 // CHECK-64-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -720,47 +724,50 @@ int bar(int n){ // CHECK-64: else: // CHECK-64-NEXT: br label [[IFCONT]] // CHECK-64: ifcont: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-64: then2: +// CHECK-64: then3: // CHECK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK-64-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 // CHECK-64-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK-64-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK-64-NEXT: br label [[IFCONT4:%.*]] -// CHECK-64: else3: +// CHECK-64: else4: // CHECK-64-NEXT: br label [[IFCONT4]] -// CHECK-64: ifcont4: +// CHECK-64: ifcont5: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-64-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK-64: then6: +// CHECK-64: then8: // CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK-64-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 // CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK-64-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK-64-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK-64-NEXT: br label [[IFCONT8:%.*]] -// CHECK-64: else7: +// CHECK-64: else9: // CHECK-64-NEXT: br label [[IFCONT8]] -// CHECK-64: ifcont8: +// CHECK-64: ifcont10: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-64-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-64-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-64-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK-64: then10: +// CHECK-64: then13: // CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK-64-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8 // CHECK-64-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK-64-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK-64-NEXT: br label [[IFCONT12:%.*]] -// CHECK-64: else11: +// CHECK-64: else14: // CHECK-64-NEXT: br label [[IFCONT12]] -// CHECK-64: ifcont12: +// CHECK-64: ifcont15: // CHECK-64-NEXT: ret void // // @@ -890,7 +897,6 @@ int bar(int n){ // CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // 
CHECK-32-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-32-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -906,6 +912,7 @@ int bar(int n){ // CHECK-32-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 // CHECK-32-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK-32: body: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]]) // CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -920,11 +927,12 @@ int bar(int n){ // CHECK-32: else: // CHECK-32-NEXT: br label [[IFCONT]] // CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-32: then2: +// CHECK-32: then3: // CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK-32-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 @@ -932,9 +940,9 @@ int bar(int n){ // CHECK-32-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // CHECK-32-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK-32-NEXT: br label [[IFCONT4:%.*]] -// CHECK-32: else3: +// CHECK-32: else4: // CHECK-32-NEXT: br label [[IFCONT4]] -// CHECK-32: ifcont4: +// CHECK-32: ifcont5: // CHECK-32-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // CHECK-32-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK-32-NEXT: br label [[PRECOND]] @@ -1112,7 +1120,6 @@ int bar(int n){ // CHECK-32-NEXT: entry: // CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-32-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -1121,6 +1128,7 @@ int bar(int n){ // CHECK-32-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK-32-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -1134,47 +1142,50 @@ int bar(int n){ // CHECK-32: else: // CHECK-32-NEXT: br label [[IFCONT]] // CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 
[[TMP2]]) // CHECK-32-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-32: then2: +// CHECK-32: then3: // CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK-32-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1 // CHECK-32-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 // CHECK-32-NEXT: br label [[IFCONT4:%.*]] -// CHECK-32: else3: +// CHECK-32: else4: // CHECK-32-NEXT: br label [[IFCONT4]] -// CHECK-32: ifcont4: +// CHECK-32: ifcont5: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK-32: then6: +// CHECK-32: then8: // CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK-32-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK-32-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4 // CHECK-32-NEXT: br label [[IFCONT8:%.*]] -// CHECK-32: else7: +// CHECK-32: else9: // CHECK-32-NEXT: br label [[IFCONT8]] -// CHECK-32: ifcont8: +// CHECK-32: ifcont10: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK-32: then10: +// CHECK-32: then13: // CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK-32-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4 // CHECK-32-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4 // CHECK-32-NEXT: br label [[IFCONT12:%.*]] -// CHECK-32: else11: +// CHECK-32: else14: // CHECK-32-NEXT: br label [[IFCONT12]] -// CHECK-32: ifcont12: +// CHECK-32: ifcont15: // CHECK-32-NEXT: ret void // // @@ -1365,7 +1376,6 @@ int bar(int n){ // CHECK-32-NEXT: entry: // CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-32-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -1374,6 +1384,7 @@ int bar(int 
n){ // CHECK-32-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK-32-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -1387,47 +1398,50 @@ int bar(int n){ // CHECK-32: else: // CHECK-32-NEXT: br label [[IFCONT]] // CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-32: then2: +// CHECK-32: then3: // CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK-32-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK-32-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK-32-NEXT: br label [[IFCONT4:%.*]] -// CHECK-32: else3: +// CHECK-32: else4: // CHECK-32-NEXT: br label [[IFCONT4]] -// CHECK-32: ifcont4: +// CHECK-32: ifcont5: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK-32: then6: +// CHECK-32: then8: // CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK-32-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK-32-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK-32-NEXT: br label [[IFCONT8:%.*]] -// CHECK-32: else7: +// CHECK-32: else9: // CHECK-32-NEXT: br label [[IFCONT8]] -// CHECK-32: ifcont8: +// CHECK-32: ifcont10: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK-32: then10: +// CHECK-32: then13: // CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK-32-NEXT: [[TMP24:%.*]] = load volatile i16, ptr 
addrspace(3) [[TMP21]], align 2 // CHECK-32-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK-32-NEXT: br label [[IFCONT12:%.*]] -// CHECK-32: else11: +// CHECK-32: else14: // CHECK-32-NEXT: br label [[IFCONT12]] -// CHECK-32: ifcont12: +// CHECK-32: ifcont15: // CHECK-32-NEXT: ret void // // @@ -1557,7 +1571,6 @@ int bar(int n){ // CHECK-32-EX-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK-32-EX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK-32-EX-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-32-EX-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -1573,6 +1586,7 @@ int bar(int n){ // CHECK-32-EX-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 // CHECK-32-EX-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK-32-EX: body: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-EX-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -1587,11 +1601,12 @@ int bar(int n){ // CHECK-32-EX: else: // CHECK-32-EX-NEXT: br label [[IFCONT]] // CHECK-32-EX: ifcont: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK-32-EX-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-32-EX: then2: +// CHECK-32-EX: then3: // CHECK-32-EX-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-EX-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK-32-EX-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 @@ -1599,9 +1614,9 @@ int bar(int n){ // CHECK-32-EX-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // CHECK-32-EX-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK-32-EX-NEXT: br label [[IFCONT4:%.*]] -// CHECK-32-EX: else3: +// CHECK-32-EX: else4: // CHECK-32-EX-NEXT: br label [[IFCONT4]] -// CHECK-32-EX: ifcont4: +// CHECK-32-EX: ifcont5: // CHECK-32-EX-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // CHECK-32-EX-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK-32-EX-NEXT: br label [[PRECOND]] @@ -1779,7 +1794,6 @@ int bar(int n){ // CHECK-32-EX-NEXT: entry: // CHECK-32-EX-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK-32-EX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-32-EX-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -1788,6 +1802,7 @@ int bar(int n){ // CHECK-32-EX-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK-32-EX-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK-32-EX-NEXT: [[TMP6:%.*]] = load 
ptr, ptr [[DOTADDR]], align 4 +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-EX-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -1801,47 +1816,50 @@ int bar(int n){ // CHECK-32-EX: else: // CHECK-32-EX-NEXT: br label [[IFCONT]] // CHECK-32-EX: ifcont: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK-32-EX-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-32-EX: then2: +// CHECK-32-EX: then3: // CHECK-32-EX-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-EX-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK-32-EX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK-32-EX-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1 // CHECK-32-EX-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 // CHECK-32-EX-NEXT: br label [[IFCONT4:%.*]] -// CHECK-32-EX: else3: +// CHECK-32-EX: else4: // CHECK-32-EX-NEXT: br label [[IFCONT4]] -// CHECK-32-EX: ifcont4: +// CHECK-32-EX: ifcont5: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-EX-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK-32-EX: then6: +// CHECK-32-EX: then8: // CHECK-32-EX-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-EX-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK-32-EX-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK-32-EX-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK-32-EX-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4 // CHECK-32-EX-NEXT: br label [[IFCONT8:%.*]] -// CHECK-32-EX: else7: +// CHECK-32-EX: else9: // CHECK-32-EX-NEXT: br label [[IFCONT8]] -// CHECK-32-EX: ifcont8: +// CHECK-32-EX: ifcont10: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK-32-EX-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK-32-EX: then10: +// CHECK-32-EX: then13: // CHECK-32-EX-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-EX-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-EX-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK-32-EX-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4 // 
CHECK-32-EX-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4 // CHECK-32-EX-NEXT: br label [[IFCONT12:%.*]] -// CHECK-32-EX: else11: +// CHECK-32-EX: else14: // CHECK-32-EX-NEXT: br label [[IFCONT12]] -// CHECK-32-EX: ifcont12: +// CHECK-32-EX: ifcont15: // CHECK-32-EX-NEXT: ret void // // @@ -2032,7 +2050,6 @@ int bar(int n){ // CHECK-32-EX-NEXT: entry: // CHECK-32-EX-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK-32-EX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-32-EX-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -2041,6 +2058,7 @@ int bar(int n){ // CHECK-32-EX-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK-32-EX-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK-32-EX-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-EX-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -2054,46 +2072,49 @@ int bar(int n){ // CHECK-32-EX: else: // CHECK-32-EX-NEXT: br label [[IFCONT]] // CHECK-32-EX: ifcont: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK-32-EX-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK-32-EX: then2: +// CHECK-32-EX: then3: // CHECK-32-EX-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-EX-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK-32-EX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK-32-EX-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK-32-EX-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK-32-EX-NEXT: br label [[IFCONT4:%.*]] -// CHECK-32-EX: else3: +// CHECK-32-EX: else4: // CHECK-32-EX-NEXT: br label [[IFCONT4]] -// CHECK-32-EX: ifcont4: +// CHECK-32-EX: ifcont5: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-32-EX-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK-32-EX: then6: +// CHECK-32-EX: then8: // CHECK-32-EX-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-EX-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK-32-EX-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK-32-EX-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK-32-EX-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK-32-EX-NEXT: br label [[IFCONT8:%.*]] -// 
CHECK-32-EX: else7: +// CHECK-32-EX: else9: // CHECK-32-EX-NEXT: br label [[IFCONT8]] -// CHECK-32-EX: ifcont8: +// CHECK-32-EX: ifcont10: +// CHECK-32-EX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-32-EX-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-32-EX-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-32-EX-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK-32-EX-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK-32-EX: then10: +// CHECK-32-EX: then13: // CHECK-32-EX-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK-32-EX-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK-32-EX-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK-32-EX-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK-32-EX-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK-32-EX-NEXT: br label [[IFCONT12:%.*]] -// CHECK-32-EX: else11: +// CHECK-32-EX: else14: // CHECK-32-EX-NEXT: br label [[IFCONT12]] -// CHECK-32-EX: ifcont12: +// CHECK-32-EX: ifcont15: // CHECK-32-EX-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp index fac42732022ad8..7b37480856ca25 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp @@ -36,14 +36,14 @@ void test() { // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]] +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_kernel_environment, ptr [[DYN_PTR]]) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA14:![0-9]+]] +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]] // CHECK1-NEXT: call void @__kmpc_target_deinit() // CHECK1-NEXT: ret void @@ -66,78 +66,78 @@ void test() { // CHECK1-NEXT: [[REF_TMP:%.*]] = alloca float, align 4 // CHECK1-NEXT: [[REF_TMP2:%.*]] = alloca float, align 4 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], 
ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 8) // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99 // CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK1: cond.true: // CHECK1-NEXT: br label [[COND_END:%.*]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: br label [[COND_END]] // CHECK1: cond.end: // CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] // CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK1: omp.inner.for.cond.cleanup: // CHECK1-NEXT: br label 
[[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA16:![0-9]+]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP2]], align 4, !tbaa [[TBAA16]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP2]], align 4 // CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR12:[0-9]+]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4 // CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4 -// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4 // CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1 // CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8 // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1 -// 
CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK1: omp.inner.for.end: // CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] @@ -161,9 +161,9 @@ void test() { // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8 @@ -197,79 +197,79 @@ void test() { // CHECK1-NEXT: [[REF_TMP15:%.*]] = alloca float, align 4 // CHECK1-NEXT: [[REF_TMP16:%.*]] = alloca float, align 4 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], 
align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]] // CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1 // CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 1 // CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 // CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]] // CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] // CHECK1: omp.precond.then: // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[PARTIAL_SUM5]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA16]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP6]], align 4, !tbaa [[TBAA16]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP6]], align 4 // CHECK1-NEXT: call void 
@_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR12]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 // CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB3:[0-9]+]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) // CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]] // CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK1: cond.true: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: br label [[COND_END:%.*]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: br label [[COND_END]] // CHECK1: cond.end: // CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1 // CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]] // CHECK1-NEXT: br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]] @@ -278,28 +278,28 @@ void test() { // CHECK1: omp.dispatch.body: // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1 // CHECK1-NEXT: [[CMP12:%.*]] = icmp 
ult i32 [[TMP20]], [[ADD11]] // CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK1: omp.inner.for.cond.cleanup: // CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1 // CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]] -// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP14]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP15]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4 // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to float -// CHECK1-NEXT: store float [[CONV]], ptr [[REF_TMP15]], align 4, !tbaa [[TBAA16]] +// CHECK1-NEXT: store float [[CONV]], ptr [[REF_TMP15]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4 // CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to float -// CHECK1-NEXT: store float [[CONV17]], ptr [[REF_TMP16]], align 4, !tbaa [[TBAA16]] +// CHECK1-NEXT: store float [[CONV17]], ptr [[REF_TMP16]], align 4 // CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP15]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR12]] // CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR12]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]] @@ -309,25 +309,25 @@ void test() { // CHECK1: omp.body.continue: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1 -// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK1: omp.inner.for.end: // CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]] -// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 
4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]] -// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] // CHECK1: omp.dispatch.end: // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]]) // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8 @@ -358,53 +358,53 @@ void test() { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8 // CHECK1-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(ptr nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR12]] // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA18:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4 // CHECK1-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]] -// CHECK1-NEXT: store float [[ADD]], ptr [[__RE_]], align 4, !tbaa [[TBAA18]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store float [[ADD]], ptr [[__RE_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8 // CHECK1-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(ptr nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR12]] // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA20:![0-9]+]] +// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4 // CHECK1-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]] -// CHECK1-NEXT: store float [[ADD3]], ptr [[__IM_]], align 4, !tbaa [[TBAA20]] +// CHECK1-NEXT: store float [[ADD3]], ptr [[__IM_]], align 4 // CHECK1-NEXT: ret ptr [[THIS1]] // // // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK1-SAME: (ptr [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR7:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7:[0-9]+]] { // 
CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 4 -// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21:![0-9]+]] -// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]] +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 8 +// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr %"class.std::complex", ptr [[TMP9]], i64 1 -// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size() // CHECK1-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16 // CHECK1-NEXT: [[TMP15:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]]) -// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i64 1 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = icmp eq i16 [[TMP7]], 0 // CHECK1-NEXT: [[TMP19:%.*]] = icmp eq i16 [[TMP7]], 1 // CHECK1-NEXT: [[TMP20:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]] @@ -433,7 +433,7 @@ void test() { // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[TMP35]], align 8 -// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP36]], ptr align 4 [[TMP34]], i64 8, i1 false), !tbaa.struct [[TBAA_STRUCT23:![0-9]+]] +// CHECK1-NEXT: call void 
@llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP36]], ptr align 8 [[TMP34]], i64 8, i1 false) // CHECK1-NEXT: br label [[IFCONT6:%.*]] // CHECK1: else5: // CHECK1-NEXT: br label [[IFCONT6]] @@ -442,33 +442,33 @@ void test() { // // // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK1-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: br label [[PRECOND:%.*]] // CHECK1: precond: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 // CHECK1-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK1: body: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK1: then: // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]] // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4 @@ -477,46 +477,47 @@ void test() { // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] // CHECK1: ifcont: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK1: then2: +// CHECK1: then3: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) 
@__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]] -// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 +// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK1-NEXT: br label [[IFCONT4:%.*]] -// CHECK1: else3: +// CHECK1: else4: // CHECK1-NEXT: br label [[IFCONT4]] -// CHECK1: ifcont4: +// CHECK1: ifcont5: // CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 -// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: br label [[PRECOND]] // CHECK1: exit: // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1 -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]] // CHECK1-NEXT: ret void // @@ -527,14 +528,14 @@ void test() { // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 
@__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_kernel_environment, ptr [[DYN_PTR]]) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] // CHECK1-NEXT: call void @__kmpc_target_deinit() // CHECK1-NEXT: ret void @@ -557,78 +558,78 @@ void test() { // CHECK1-NEXT: [[REF_TMP:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[REF_TMP2:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 16) // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99 // CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK1: cond.true: // CHECK1-NEXT: br label 
[[COND_END:%.*]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: br label [[COND_END]] // CHECK1: cond.end: // CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] // CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK1: omp.inner.for.cond.cleanup: // CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA24:![0-9]+]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP2]], align 8, !tbaa [[TBAA24]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP2]], align 8 // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR12]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4 // CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4 -// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4 // CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1 // CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr 
[[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8 // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK1: omp.inner.for.end: // CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] @@ -652,9 +653,9 @@ void test() { // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8 @@ -688,79 +689,79 @@ void test() { // CHECK1-NEXT: [[REF_TMP15:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[REF_TMP16:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr 
[[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]] // CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1 // CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 1 // CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 // CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]] // CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] // CHECK1: omp.precond.then: // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr 
[[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[PARTIAL_SUM5]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA24]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP6]], align 8, !tbaa [[TBAA24]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP6]], align 8 // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR12]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 // CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB3]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) // CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]] // CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK1: cond.true: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: br label [[COND_END:%.*]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: br label [[COND_END]] // CHECK1: cond.end: // CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr 
[[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1 // CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]] // CHECK1-NEXT: br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]] @@ -769,28 +770,28 @@ void test() { // CHECK1: omp.dispatch.body: // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1 // CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]] // CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK1: omp.inner.for.cond.cleanup: // CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1 // CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]] -// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[REF_TMP14]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP15]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4 // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to double -// CHECK1-NEXT: store double [[CONV]], ptr [[REF_TMP15]], align 8, !tbaa [[TBAA24]] +// CHECK1-NEXT: store double [[CONV]], ptr [[REF_TMP15]], align 8 // CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4 // CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to double -// CHECK1-NEXT: store double [[CONV17]], ptr [[REF_TMP16]], align 8, !tbaa [[TBAA24]] +// CHECK1-NEXT: store double [[CONV17]], ptr [[REF_TMP16]], align 8 // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP15]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR12]] // CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) ptr 
@_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR12]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]] @@ -800,25 +801,25 @@ void test() { // CHECK1: omp.body.continue: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1 -// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK1: omp.inner.for.end: // CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]] -// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]] -// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] // CHECK1: omp.dispatch.end: // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]]) // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8 @@ -849,26 +850,26 @@ void test() { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8 // CHECK1-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(ptr nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR12]] // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa 
[[TBAA26:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]] -// CHECK1-NEXT: store double [[ADD]], ptr [[__RE_]], align 8, !tbaa [[TBAA26]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store double [[ADD]], ptr [[__RE_]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8 // CHECK1-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(ptr nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR12]] // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA28:![0-9]+]] +// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8 // CHECK1-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]] -// CHECK1-NEXT: store double [[ADD3]], ptr [[__IM_]], align 8, !tbaa [[TBAA28]] +// CHECK1-NEXT: store double [[ADD3]], ptr [[__IM_]], align 8 // CHECK1-NEXT: ret ptr [[THIS1]] // // // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1 -// CHECK1-SAME: (ptr [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR7]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 @@ -876,14 +877,14 @@ void test() { // CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.0", align 8 -// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]] +// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 @@ -908,7 +909,7 @@ void test() { // CHECK1-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1 // CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND]] // CHECK1: .shuffle.exit: -// CHECK1-NEXT: store ptr 
[[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0 // CHECK1-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1 // CHECK1-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]] @@ -937,7 +938,7 @@ void test() { // CHECK1-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8 // CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0 // CHECK1-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8 -// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP43]], ptr align 8 [[TMP41]], i64 16, i1 false), !tbaa.struct [[TBAA_STRUCT29:![0-9]+]] +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP43]], ptr align 8 [[TMP41]], i64 16, i1 false) // CHECK1-NEXT: br label [[IFCONT6:%.*]] // CHECK1: else5: // CHECK1-NEXT: br label [[IFCONT6]] @@ -946,33 +947,33 @@ void test() { // // // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2 -// CHECK1-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: br label [[PRECOND:%.*]] // CHECK1: precond: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 4 // CHECK1-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK1: body: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK1: then: // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]] // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr 
[[TMP11]], align 4 @@ -981,46 +982,47 @@ void test() { // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] // CHECK1: ifcont: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK1: then2: +// CHECK1: then3: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]] -// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4, !tbaa [[TBAA14]] -// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 +// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK1-NEXT: br label [[IFCONT4:%.*]] -// CHECK1: else3: +// CHECK1: else4: // CHECK1-NEXT: br label [[IFCONT4]] -// CHECK1: ifcont4: +// CHECK1: ifcont5: // CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 -// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: br label [[PRECOND]] // CHECK1: exit: // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA21]] -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]] +// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1 -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: 
[[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]] // CHECK1-NEXT: ret void // @@ -1031,18 +1033,18 @@ void test() { // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA16]] -// CHECK1-NEXT: store float [[TMP1]], ptr [[__RE_]], align 4, !tbaa [[TBAA18]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4 +// CHECK1-NEXT: store float [[TMP1]], ptr [[__RE_]], align 4 // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4, !tbaa [[TBAA16]] -// CHECK1-NEXT: store float [[TMP3]], ptr [[__IM_]], align 4, !tbaa [[TBAA20]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4 +// CHECK1-NEXT: store float [[TMP3]], ptr [[__IM_]], align 4 // CHECK1-NEXT: ret void // // @@ -1050,10 +1052,10 @@ void test() { // CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA18]] +// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4 // CHECK1-NEXT: ret float [[TMP0]] // // @@ -1061,10 +1063,10 @@ void test() { // CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA20]] +// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__IM_]], 
align 4 // CHECK1-NEXT: ret float [[TMP0]] // // @@ -1074,18 +1076,18 @@ void test() { // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8 +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA24]] -// CHECK1-NEXT: store double [[TMP1]], ptr [[__RE_]], align 8, !tbaa [[TBAA26]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8 +// CHECK1-NEXT: store double [[TMP1]], ptr [[__RE_]], align 8 // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8, !tbaa [[TBAA24]] -// CHECK1-NEXT: store double [[TMP3]], ptr [[__IM_]], align 8, !tbaa [[TBAA28]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8 +// CHECK1-NEXT: store double [[TMP3]], ptr [[__IM_]], align 8 // CHECK1-NEXT: ret void // // @@ -1093,10 +1095,10 @@ void test() { // CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA26]] +// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8 // CHECK1-NEXT: ret double [[TMP0]] // // @@ -1104,9 +1106,9 @@ void test() { // CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA28]] +// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8 // CHECK1-NEXT: ret double [[TMP0]] // diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index 360a780c75383c..350b6f761e573a 100644 --- 
a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -182,7 +182,6 @@ int bar(int n){ // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -198,6 +197,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 // CHECK1-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK1: body: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -212,11 +212,12 @@ int bar(int n){ // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] // CHECK1: ifcont: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK1: then2: +// CHECK1: then3: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 @@ -224,9 +225,9 @@ int bar(int n){ // CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK1-NEXT: br label [[IFCONT4:%.*]] -// CHECK1: else3: +// CHECK1: else4: // CHECK1-NEXT: br label [[IFCONT4]] -// CHECK1: ifcont4: +// CHECK1: ifcont5: // CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: br label [[PRECOND]] @@ -496,7 +497,6 @@ int bar(int n){ // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -505,6 +505,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -518,47 +519,50 @@ int bar(int n){ // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] // CHECK1: ifcont: 
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK1: then2: +// CHECK1: then3: // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1 // CHECK1-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 // CHECK1-NEXT: br label [[IFCONT4:%.*]] -// CHECK1: else3: +// CHECK1: else4: // CHECK1-NEXT: br label [[IFCONT4]] -// CHECK1: ifcont4: +// CHECK1: ifcont5: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK1: then6: +// CHECK1: then8: // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK1-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4 // CHECK1-NEXT: br label [[IFCONT8:%.*]] -// CHECK1: else7: +// CHECK1: else9: // CHECK1-NEXT: br label [[IFCONT8]] -// CHECK1: ifcont8: +// CHECK1: ifcont10: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK1: then10: +// CHECK1: then13: // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4 // CHECK1-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4 // CHECK1-NEXT: br label [[IFCONT12:%.*]] -// CHECK1: else11: +// CHECK1: else14: // CHECK1-NEXT: br label [[IFCONT12]] -// CHECK1: ifcont12: +// CHECK1: ifcont15: // CHECK1-NEXT: ret void // // @@ -579,7 +583,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]] // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0 // CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1 -// CHECK1-NEXT: store i8 [[TMP9]], ptr [[C]], align 4 +// CHECK1-NEXT: store i8 [[TMP9]], ptr [[C]], align 1 // CHECK1-NEXT:
[[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]] @@ -630,7 +634,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]] // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 1 // CHECK1-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 @@ -909,7 +913,6 @@ int bar(int n){ // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -918,6 +921,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -931,47 +935,50 @@ int bar(int n){ // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] // CHECK1: ifcont: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK1: then2: +// CHECK1: then3: // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK1-NEXT: br label [[IFCONT4:%.*]] -// CHECK1: else3: +// CHECK1: else4: // CHECK1-NEXT: br label [[IFCONT4]] -// CHECK1: ifcont4: +// CHECK1: ifcont5: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK1: then6: +// CHECK1: then8: // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK1-NEXT: [[TMP17:%.*]] = 
load ptr, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK1-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK1-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK1-NEXT: br label [[IFCONT8:%.*]] -// CHECK1: else7: +// CHECK1: else9: // CHECK1-NEXT: br label [[IFCONT8]] -// CHECK1: ifcont8: +// CHECK1: ifcont10: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK1: then10: +// CHECK1: then13: // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK1-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK1-NEXT: br label [[IFCONT12:%.*]] -// CHECK1: else11: +// CHECK1: else14: // CHECK1-NEXT: br label [[IFCONT12]] -// CHECK1: ifcont12: +// CHECK1: ifcont15: // CHECK1-NEXT: ret void // // @@ -1067,7 +1074,6 @@ int bar(int n){ // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -1076,6 +1082,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -1089,47 +1096,50 @@ int bar(int n){ // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] // CHECK1: ifcont: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK1: then2: +// CHECK1: then3: // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK1-NEXT: store i32 
[[TMP15]], ptr [[TMP14]], align 4 // CHECK1-NEXT: br label [[IFCONT4:%.*]] -// CHECK1: else3: +// CHECK1: else4: // CHECK1-NEXT: br label [[IFCONT4]] -// CHECK1: ifcont4: +// CHECK1: ifcont5: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK1: then6: +// CHECK1: then8: // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK1-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK1-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK1-NEXT: br label [[IFCONT8:%.*]] -// CHECK1: else7: +// CHECK1: else9: // CHECK1-NEXT: br label [[IFCONT8]] -// CHECK1: ifcont8: +// CHECK1: ifcont10: +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK1: then10: +// CHECK1: then13: // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1 // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK1-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK1-NEXT: br label [[IFCONT12:%.*]] -// CHECK1: else11: +// CHECK1: else14: // CHECK1-NEXT: br label [[IFCONT12]] -// CHECK1: ifcont12: +// CHECK1: ifcont15: // CHECK1-NEXT: ret void // // @@ -1156,7 +1166,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]] // CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1 // CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2 -// CHECK1-NEXT: store i16 [[TMP13]], ptr [[B]], align 4 +// CHECK1-NEXT: store i16 [[TMP13]], ptr [[B]], align 2 // CHECK1-NEXT: ret void // // @@ -1207,7 +1217,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]] // CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 2 // CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2 // CHECK1-NEXT: ret void // @@ -1369,7 +1379,6 @@ int bar(int n){ // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr 
@[[GLOB1]]) // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -1385,6 +1394,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 // CHECK2-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK2: body: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -1399,11 +1409,12 @@ int bar(int n){ // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] // CHECK2: ifcont: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK2: then2: +// CHECK2: then3: // CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 @@ -1411,9 +1422,9 @@ int bar(int n){ // CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // CHECK2-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK2-NEXT: br label [[IFCONT4:%.*]] -// CHECK2: else3: +// CHECK2: else4: // CHECK2-NEXT: br label [[IFCONT4]] -// CHECK2: ifcont4: +// CHECK2: ifcont5: // CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // CHECK2-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK2-NEXT: br label [[PRECOND]] @@ -1683,7 +1694,6 @@ int bar(int n){ // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -1692,6 +1702,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -1705,47 +1716,50 @@ int bar(int n){ // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] // CHECK2: ifcont: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label 
[[ELSE3:%.*]] -// CHECK2: then2: +// CHECK2: then3: // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK2-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1 // CHECK2-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 // CHECK2-NEXT: br label [[IFCONT4:%.*]] -// CHECK2: else3: +// CHECK2: else4: // CHECK2-NEXT: br label [[IFCONT4]] -// CHECK2: ifcont4: +// CHECK2: ifcont5: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK2: then6: +// CHECK2: then8: // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK2-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4 // CHECK2-NEXT: br label [[IFCONT8:%.*]] -// CHECK2: else7: +// CHECK2: else9: // CHECK2-NEXT: br label [[IFCONT8]] -// CHECK2: ifcont8: +// CHECK2: ifcont10: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK2: then10: +// CHECK2: then13: // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK2-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4 // CHECK2-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4 // CHECK2-NEXT: br label [[IFCONT12:%.*]] -// CHECK2: else11: +// CHECK2: else14: // CHECK2-NEXT: br label [[IFCONT12]] -// CHECK2: ifcont12: +// CHECK2: ifcont15: // CHECK2-NEXT: ret void // // @@ -1766,7 +1780,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]] // CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0 // CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1 -// CHECK2-NEXT: store i8 [[TMP9]], ptr [[C]], align 4 +// CHECK2-NEXT: store i8 [[TMP9]], ptr [[C]], align 1 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]] @@ -1817,7 +1831,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 // 
CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]] // CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 1 // CHECK2-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 @@ -2096,7 +2110,6 @@ int bar(int n){ // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -2105,6 +2118,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -2118,47 +2132,50 @@ int bar(int n){ // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] // CHECK2: ifcont: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK2: then2: +// CHECK2: then3: // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK2-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK2-NEXT: br label [[IFCONT4:%.*]] -// CHECK2: else3: +// CHECK2: else4: // CHECK2-NEXT: br label [[IFCONT4]] -// CHECK2: ifcont4: +// CHECK2: ifcont5: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK2: then6: +// CHECK2: then8: // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK2-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK2-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK2-NEXT: 
br label [[IFCONT8:%.*]] -// CHECK2: else7: +// CHECK2: else9: // CHECK2-NEXT: br label [[IFCONT8]] -// CHECK2: ifcont8: +// CHECK2: ifcont10: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK2: then10: +// CHECK2: then13: // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK2-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK2-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK2-NEXT: br label [[IFCONT12:%.*]] -// CHECK2: else11: +// CHECK2: else14: // CHECK2-NEXT: br label [[IFCONT12]] -// CHECK2: ifcont12: +// CHECK2: ifcont15: // CHECK2-NEXT: ret void // // @@ -2254,7 +2271,6 @@ int bar(int n){ // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -2263,6 +2279,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -2276,47 +2293,50 @@ int bar(int n){ // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] // CHECK2: ifcont: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK2: then2: +// CHECK2: then3: // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK2-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK2-NEXT: br label [[IFCONT4:%.*]] -// CHECK2: else3: +// CHECK2: else4: // CHECK2-NEXT: br label [[IFCONT4]] -// CHECK2: ifcont4: +// CHECK2: ifcont5: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: 
[[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK2: then6: +// CHECK2: then8: // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK2-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK2-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK2-NEXT: br label [[IFCONT8:%.*]] -// CHECK2: else7: +// CHECK2: else9: // CHECK2-NEXT: br label [[IFCONT8]] -// CHECK2: ifcont8: +// CHECK2: ifcont10: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK2: then10: +// CHECK2: then13: // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK2-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK2-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK2-NEXT: br label [[IFCONT12:%.*]] -// CHECK2: else11: +// CHECK2: else14: // CHECK2-NEXT: br label [[IFCONT12]] -// CHECK2: ifcont12: +// CHECK2: ifcont15: // CHECK2-NEXT: ret void // // @@ -2343,7 +2363,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]] // CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1 // CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2 -// CHECK2-NEXT: store i16 [[TMP13]], ptr [[B]], align 4 +// CHECK2-NEXT: store i16 [[TMP13]], ptr [[B]], align 2 // CHECK2-NEXT: ret void // // @@ -2394,7 +2414,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]] // CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 2 // CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2 // CHECK2-NEXT: ret void // @@ -2556,7 +2576,6 @@ int bar(int n){ // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -2572,6 +2591,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 // CHECK3-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], 
label [[EXIT:%.*]] // CHECK3: body: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -2586,11 +2606,12 @@ int bar(int n){ // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK3: then2: +// CHECK3: then3: // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 @@ -2598,9 +2619,9 @@ int bar(int n){ // CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK3-NEXT: br label [[IFCONT4:%.*]] -// CHECK3: else3: +// CHECK3: else4: // CHECK3-NEXT: br label [[IFCONT4]] -// CHECK3: ifcont4: +// CHECK3: ifcont5: // CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // CHECK3-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK3-NEXT: br label [[PRECOND]] @@ -2870,7 +2891,6 @@ int bar(int n){ // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -2879,6 +2899,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -2892,47 +2913,50 @@ int bar(int n){ // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK3: then2: +// CHECK3: then3: // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // 
CHECK3-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1 // CHECK3-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 // CHECK3-NEXT: br label [[IFCONT4:%.*]] -// CHECK3: else3: +// CHECK3: else4: // CHECK3-NEXT: br label [[IFCONT4]] -// CHECK3: ifcont4: +// CHECK3: ifcont5: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: +// CHECK3: then8: // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK3-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4 // CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: +// CHECK3: else9: // CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: +// CHECK3: ifcont10: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK3: then10: +// CHECK3: then13: // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK3-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4 // CHECK3-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4 // CHECK3-NEXT: br label [[IFCONT12:%.*]] -// CHECK3: else11: +// CHECK3: else14: // CHECK3-NEXT: br label [[IFCONT12]] -// CHECK3: ifcont12: +// CHECK3: ifcont15: // CHECK3-NEXT: ret void // // @@ -2953,7 +2977,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]] // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0 // CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1 -// CHECK3-NEXT: store i8 [[TMP9]], ptr [[C]], align 4 +// CHECK3-NEXT: store i8 [[TMP9]], ptr [[C]], align 1 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]] @@ -3004,7 +3028,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]] // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 1 // CHECK3-NEXT: store i8 
[[TMP9]], ptr [[TMP7]], align 1 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 @@ -3283,7 +3307,6 @@ int bar(int n){ // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -3292,6 +3315,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -3305,47 +3329,50 @@ int bar(int n){ // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK3: then2: +// CHECK3: then3: // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK3-NEXT: br label [[IFCONT4:%.*]] -// CHECK3: else3: +// CHECK3: else4: // CHECK3-NEXT: br label [[IFCONT4]] -// CHECK3: ifcont4: +// CHECK3: ifcont5: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: +// CHECK3: then8: // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK3-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK3-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: +// CHECK3: else9: // CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: +// CHECK3: ifcont10: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // 
CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK3: then10: +// CHECK3: then13: // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK3-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK3-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK3-NEXT: br label [[IFCONT12:%.*]] -// CHECK3: else11: +// CHECK3: else14: // CHECK3-NEXT: br label [[IFCONT12]] -// CHECK3: ifcont12: +// CHECK3: ifcont15: // CHECK3-NEXT: ret void // // @@ -3441,7 +3468,6 @@ int bar(int n){ // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -3450,6 +3476,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -3463,47 +3490,50 @@ int bar(int n){ // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]] // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK3: then2: +// CHECK3: then3: // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0 // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4 // CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 // CHECK3-NEXT: br label [[IFCONT4:%.*]] -// CHECK3: else3: +// CHECK3: else4: // CHECK3-NEXT: br label [[IFCONT4]] -// CHECK3: ifcont4: +// CHECK3: ifcont5: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: +// CHECK3: then8: // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK3-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2 // CHECK3-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2 // CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: +// CHECK3: else9: // CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: +// CHECK3: ifcont10: +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]] // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] -// CHECK3: then10: +// CHECK3: then13: // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1 // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4 // CHECK3-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2 // CHECK3-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2 // CHECK3-NEXT: br label [[IFCONT12:%.*]] -// CHECK3: else11: +// CHECK3: else14: // CHECK3-NEXT: br label [[IFCONT12]] -// CHECK3: ifcont12: +// CHECK3: ifcont15: // CHECK3-NEXT: ret void // // @@ -3530,7 +3560,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]] // CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1 // CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2 -// CHECK3-NEXT: store i16 [[TMP13]], ptr [[B]], align 4 +// CHECK3-NEXT: store i16 [[TMP13]], ptr [[B]], align 2 // CHECK3-NEXT: ret void // // @@ -3581,7 +3611,7 @@ int bar(int n){ // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]] // CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 2 // CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2 // CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/reduction_complex.c b/clang/test/OpenMP/reduction_complex.c new file mode 100644 index 00000000000000..e00caa8f90fdf7 --- /dev/null +++ b/clang/test/OpenMP/reduction_complex.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \ +// RUN: -triple powerpc64le-unknown-unknown \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \ +// RUN: %t-ppc-host.bc + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \ +// RUN: -triple nvptx64-unknown-unknown -DCUA \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s \ +// RUN: -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc \ +// RUN: -o - | FileCheck %s --check-prefix CHECK + 
+// expected-no-diagnostics +int foo() { + int i; + int j; + _Complex float sum = 0; + +#pragma omp target teams loop reduction(+:sum) collapse(2) bind(parallel) order(concurrent) lastprivate(j) map(tofrom:sum) + + for(i=0; i<10; i++) + for(j=0; j<10; j++) + sum += i; + + return 0; +} +// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: %[[VAL_228:.*]] = alloca ptr, align 8 +// CHECK-NEXT: %[[VAL_229:.*]] = alloca i16, align 2 +// CHECK-NEXT: %[[VAL_230:.*]] = alloca i16, align 2 +// CHECK-NEXT: %[[VAL_231:.*]] = alloca i16, align 2 +// CHECK-NEXT: %[[VAL_232:.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: %[[VAL_233:.*]] = alloca { float, float }, align 8 +// CHECK-NEXT: store ptr %[[VAL_234:.*]], ptr %[[VAL_228]], align 8 +// CHECK-NEXT: store i16 %[[VAL_235:.*]], ptr %[[VAL_229]], align 2 +// CHECK-NEXT: store i16 %[[VAL_236:.*]], ptr %[[VAL_230]], align 2 +// CHECK-NEXT: store i16 %[[VAL_237:.*]], ptr %[[VAL_231]], align 2 +// CHECK-NEXT: %[[VAL_238:.*]] = load ptr, ptr %[[VAL_228]], align 8 +// CHECK-NEXT: %[[VAL_239:.*]] = load i16, ptr %[[VAL_229]], align 2 +// CHECK-NEXT: %[[VAL_240:.*]] = load i16, ptr %[[VAL_230]], align 2 +// CHECK-NEXT: %[[VAL_241:.*]] = load i16, ptr %[[VAL_231]], align 2 +// CHECK-NEXT: %[[VAL_242:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_238]], i64 0, i64 0 +// CHECK-NEXT: %[[VAL_243:.*]] = load ptr, ptr %[[VAL_242]], align 8 +// CHECK-NEXT: %[[VAL_244:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_232]], i64 0, i64 0 +// CHECK-NEXT: %[[VAL_245:.*]] = getelementptr { float, float }, ptr %[[VAL_243]], i64 1 +// CHECK-NEXT: %[[VAL_246:.*]] = load i64, ptr %[[VAL_243]], align 8 +// CHECK-NEXT: %[[VAL_247:.*]] = call i32 @__kmpc_get_warp_size() +// CHECK-NEXT: %[[VAL_248:.*]] = trunc i32 %[[VAL_247]] to i16 +// CHECK-NEXT: %[[VAL_249:.*]] = call i64 @__kmpc_shuffle_int64(i64 %[[VAL_246]], i16 %[[VAL_240]], i16 %[[VAL_248]]) +// CHECK-NEXT: store i64 %[[VAL_249]], ptr %[[VAL_233]], align 8 +// CHECK-NEXT: %[[VAL_250:.*]] = getelementptr i64, ptr %[[VAL_243]], i64 1 +// CHECK-NEXT: %[[VAL_251:.*]] = getelementptr i64, ptr %[[VAL_233]], i64 1 +// CHECK-NEXT: store ptr %[[VAL_233]], ptr %[[VAL_244]], align 8 +// CHECK-NEXT: %[[VAL_252:.*]] = icmp eq i16 %[[VAL_241]], 0 +// CHECK-NEXT: %[[VAL_253:.*]] = icmp eq i16 %[[VAL_241]], 1 +// CHECK-NEXT: %[[VAL_254:.*]] = icmp ult i16 %[[VAL_239]], %[[VAL_240]] +// CHECK-NEXT: %[[VAL_255:.*]] = and i1 %[[VAL_253]], %[[VAL_254]] +// CHECK-NEXT: %[[VAL_256:.*]] = icmp eq i16 %[[VAL_241]], 2 +// CHECK-NEXT: %[[VAL_257:.*]] = and i16 %[[VAL_239]], 1 +// CHECK-NEXT: %[[VAL_258:.*]] = icmp eq i16 %[[VAL_257]], 0 +// CHECK-NEXT: %[[VAL_259:.*]] = and i1 %[[VAL_256]], %[[VAL_258]] +// CHECK-NEXT: %[[VAL_260:.*]] = icmp sgt i16 %[[VAL_240]], 0 +// CHECK-NEXT: %[[VAL_261:.*]] = and i1 %[[VAL_259]], %[[VAL_260]] +// CHECK-NEXT: %[[VAL_262:.*]] = or i1 %[[VAL_252]], %[[VAL_255]] +// CHECK-NEXT: %[[VAL_263:.*]] = or i1 %[[VAL_262]], %[[VAL_261]] +// CHECK-NEXT: br i1 %[[VAL_263]], label %[[VAL_264:.*]], label %[[VAL_265:.*]] +// CHECK: then: ; preds = %[[VAL_266:.*]] +// CHECK-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l{{[0-9]+}}_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr %[[VAL_238]], ptr %[[VAL_232]]) #2 +// CHECK-NEXT: br label %[[VAL_267:.*]] +// CHECK: 
else: ; preds = %[[VAL_266]] +// CHECK-NEXT: br label %[[VAL_267]] +// CHECK: ifcont: ; preds = %[[VAL_265]], %[[VAL_264]] +// CHECK-NEXT: %[[VAL_268:.*]] = icmp eq i16 %[[VAL_241]], 1 +// CHECK-NEXT: %[[VAL_269:.*]] = icmp uge i16 %[[VAL_239]], %[[VAL_240]] +// CHECK-NEXT: %[[VAL_270:.*]] = and i1 %[[VAL_268]], %[[VAL_269]] +// CHECK-NEXT: br i1 %[[VAL_270]], label %[[VAL_271:.*]], label %[[VAL_272:.*]] +// CHECK: then4: ; preds = %[[VAL_267]] +// CHECK-NEXT: %[[VAL_273:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_232]], i64 0, i64 0 +// CHECK-NEXT: %[[VAL_274:.*]] = load ptr, ptr %[[VAL_273]], align 8 +// CHECK-NEXT: %[[VAL_275:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_238]], i64 0, i64 0 +// CHECK-NEXT: %[[VAL_276:.*]] = load ptr, ptr %[[VAL_275]], align 8 +// CHECK-NEXT: %[[VAL_277:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_274]], i32 0, i32 0 +// CHECK-NEXT: %[[VAL_278:.*]] = load float, ptr %[[VAL_277]], align 4 +// CHECK-NEXT: %[[VAL_279:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_274]], i32 0, i32 1 +// CHECK-NEXT: %[[VAL_280:.*]] = load float, ptr %[[VAL_279]], align 4 +// CHECK-NEXT: %[[VAL_281:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_276]], i32 0, i32 0 +// CHECK-NEXT: %[[VAL_282:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_276]], i32 0, i32 1 +// CHECK-NEXT: store float %[[VAL_278]], ptr %[[VAL_281]], align 4 +// CHECK-NEXT: store float %[[VAL_280]], ptr %[[VAL_282]], align 4 +// CHECK-NEXT: br label %[[VAL_283:.*]] +// CHECK: else7: ; preds = %[[VAL_267]] +// CHECK-NEXT: br label %[[VAL_283]] +// CHECK: ifcont8: ; preds = %[[VAL_272]], %[[VAL_271]] +// CHECK-NEXT: ret void diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp index 765e90bcba8531..78aca56acca308 100644 --- a/clang/test/OpenMP/reduction_implicit_map.cpp +++ b/clang/test/OpenMP/reduction_implicit_map.cpp @@ -233,7 +233,6 @@ int main() // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -249,6 +248,7 @@ int main() // CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2 // CHECK-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK: body: +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]]) // CHECK-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -263,11 +263,12 @@ int main() // CHECK: else: // CHECK-NEXT: br label [[IFCONT]] // CHECK: ifcont: +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // CHECK-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// CHECK: then2: +// CHECK: then3: // CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 
0, i32 [[TMP3]] // CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 @@ -275,9 +276,9 @@ int main() // CHECK-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // CHECK-NEXT: br label [[IFCONT4:%.*]] -// CHECK: else3: +// CHECK: else4: // CHECK-NEXT: br label [[IFCONT4]] -// CHECK: ifcont4: +// CHECK: ifcont5: // CHECK-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // CHECK-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4 // CHECK-NEXT: br label [[PRECOND]] diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp index 3f752ac663f412..e05b3209f9eff2 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp @@ -85,6 +85,7 @@ int foo() { // IR-GPU-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5) // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5) // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5) +// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr // IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr // IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr // IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr @@ -102,7 +103,6 @@ int foo() { // IR-GPU-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr // IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8 @@ -258,6 +258,7 @@ int foo() { // IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[J5:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5) +// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr // IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr // IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr // IR-GPU-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr @@ -275,7 +276,6 @@ int foo() { // IR-GPU-NEXT: [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr // IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr // IR-GPU-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr -// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr 
[[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8 @@ -399,12 +399,12 @@ int foo() { // IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5) // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5) // IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5) +// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr // IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr -// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2 // IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2 @@ -480,10 +480,9 @@ int foo() { // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) +// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr -// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -499,6 +498,7 @@ int foo() { // IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100 // IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // IR-GPU: body: +// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -513,11 +513,12 @@ int foo() { // IR-GPU: else: // IR-GPU-NEXT: br label [[IFCONT]] // IR-GPU: ifcont: +// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // IR-GPU-NEXT: br i1 
[[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// IR-GPU: then2: +// IR-GPU: then3: // IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 @@ -525,9 +526,9 @@ int foo() { // IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // IR-GPU-NEXT: br label [[IFCONT4:%.*]] -// IR-GPU: else3: +// IR-GPU: else4: // IR-GPU-NEXT: br label [[IFCONT4]] -// IR-GPU: ifcont4: +// IR-GPU: ifcont5: // IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4 // IR-GPU-NEXT: br label [[PRECOND]] @@ -544,12 +545,12 @@ int foo() { // IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5) // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5) // IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5) +// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr // IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr -// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2 // IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2 @@ -625,10 +626,9 @@ int foo() { // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) +// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr -// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() @@ -644,6 +644,7 @@ int foo() { // IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100 // IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // IR-GPU: body: +// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to 
ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -658,11 +659,12 @@ int foo() { // IR-GPU: else: // IR-GPU-NEXT: br label [[IFCONT]] // IR-GPU: ifcont: +// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// IR-GPU: then2: +// IR-GPU: then3: // IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 @@ -670,9 +672,9 @@ int foo() { // IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // IR-GPU-NEXT: br label [[IFCONT4:%.*]] -// IR-GPU: else3: +// IR-GPU: else4: // IR-GPU-NEXT: br label [[IFCONT4]] -// IR-GPU: ifcont4: +// IR-GPU: ifcont5: // IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4 // IR-GPU-NEXT: br label [[PRECOND]] diff --git a/clang/test/PCH/cxx2a-requires-expr.cpp b/clang/test/PCH/cxx2a-requires-expr.cpp index 7f8f258a0f8f3c..936f6016854638 100644 --- a/clang/test/PCH/cxx2a-requires-expr.cpp +++ b/clang/test/PCH/cxx2a-requires-expr.cpp @@ -22,3 +22,20 @@ bool f() { requires C || (C || C); }; } + +namespace trailing_requires_expression { + +template requires C && C2 +// CHECK: template requires C && C2 void g(); +void g(); + +template requires C || C2 +// CHECK: template requires C || C2 constexpr int h = sizeof(T); +constexpr int h = sizeof(T); + +template requires C +// CHECK: template requires C class i { +// CHECK-NEXT: }; +class i {}; + +} diff --git a/clang/test/Parser/cxx2a-concepts-requires-expr.cpp b/clang/test/Parser/cxx2a-concepts-requires-expr.cpp index 971591afb08dba..5755844a323d2c 100644 --- a/clang/test/Parser/cxx2a-concepts-requires-expr.cpp +++ b/clang/test/Parser/cxx2a-concepts-requires-expr.cpp @@ -83,7 +83,7 @@ bool r23 = requires { typename identity::temp; }; template bool r24 = requires { typename identity::template temp; - typename identity::template temp; // expected-error{{expected an identifier or template-id after '::'}} + typename identity::template temp; // expected-error{{template argument list is expected after a name prefixed by the template keyword}} }; bool r25 = requires { ; }; diff --git a/clang/test/ParserHLSL/bitfields.hlsl b/clang/test/ParserHLSL/bitfields.hlsl new file mode 100644 index 00000000000000..307d1143a068e2 --- /dev/null +++ b/clang/test/ParserHLSL/bitfields.hlsl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -x hlsl -o - %s | FileCheck %s + + +struct MyBitFields { + // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field1 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 3 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 3 + unsigned int field1 : 3; // 3 bits for field1 + + // CHECK:FieldDecl 
0x{{[0-9a-f]+}} col:16 referenced field2 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 4 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 4 + unsigned int field2 : 4; // 4 bits for field2 + + // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:7 field3 'int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 5 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 5 + int field3 : 5; // 5 bits for field3 (signed) +}; + + + +[numthreads(1,1,1)] +void main() { + MyBitFields m; + m.field1 = 4; + m.field2 = m.field1*2; +} \ No newline at end of file diff --git a/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl new file mode 100644 index 00000000000000..2eebc920388b5b --- /dev/null +++ b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// tests that hlsl annotations are properly parsed when applied on field decls, +// and that the annotation gets properly placed on the AST. + +struct Eg9{ + // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:8 implicit struct Eg9 + // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced a 'unsigned int' + // CHECK: -HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-f]+}} + unsigned int a : SV_DispatchThreadID; +}; +Eg9 e9; + + +RWBuffer In : register(u1); + + +[numthreads(1,1,1)] +void main() { + In[0] = e9.a; +} diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 6f359461dea885..71cc36acf3f0e1 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -315,37 +315,37 @@ // RUN: %clang -target aarch64 -mcpu=thunderx2t99 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-THUNDERX2T99 %s // RUN: %clang -target aarch64 -mcpu=a64fx -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A64FX %s // RUN: %clang -target aarch64 -mcpu=carmel -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-CARMEL %s -// CHECK-MCPU-APPLE-A7: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes"{{.*}} "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A10: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes"{{.*}} "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A11: "-cc1"{{.*}} "-triple" "aarch64{{.*}}"{{.*}}"-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A12: "-cc1"{{.*}} "-triple" "aarch64"{{.*}} "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" 
"+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-A34: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A13: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "apple-a13" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" -// CHECK-MCPU-A35: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-A53: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+flagm" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+predres" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+neon" "-target-feature" "+ssbs" -// CHECK-MCPU-M3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-M4: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" 
"+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-THUNDERX2T99: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-A64FX: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" "-target-feature" "+sve" -// CHECK-MCPU-CARMEL: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A7: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A10: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lor" "-target-feature" "+neon" "-target-feature" "+pan" "-target-feature" "+perfmon" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+vh" +// CHECK-MCPU-APPLE-A11: "-cc1"{{.*}} "-triple" "aarch64{{.*}}"{{.*}}"-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A12: "-cc1"{{.*}} "-triple" "aarch64"{{.*}} "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" +// CHECK-MCPU-A34: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A13: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" +// 
CHECK-MCPU-A35: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-A53: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+ccdp" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+flagm" "-target-feature" "+fp-armv8" "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+predres" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+ssbs" +// CHECK-MCPU-M3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-M4: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" +// CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" +// CHECK-MCPU-THUNDERX2T99: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+rdm" "-target-feature" "+sha2 +// CHECK-MCPU-A64FX: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sve" +// CHECK-MCPU-CARMEL: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" 
"+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" // RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s -// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+predres" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" "-target-feature" "+ssbs" +// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+altnzcv" "-target-feature" "+ccdp" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fp16fml" "-target-feature" "+fptoint" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+predres" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+specrestrict" "-target-feature" "+ssbs" // RUN: %clang -target x86_64-apple-macosx -arch arm64_32 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64_32 %s -// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" +// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto -mabi=aapcs-soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto+nofp+nosimd+nocrc+nocrypto -mabi=aapcs-soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s // RUN: %clang -target aarch64 -march=armv8-a+nosimd -### -c %s 2>&1 | FileCheck 
-check-prefix=CHECK-MARCH-3 %s -// CHECK-MARCH-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" +// CHECK-MARCH-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+sha2" // CHECK-MARCH-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-fp-armv8"{{.*}} "-target-feature" "-neon" // CHECK-MARCH-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-neon" @@ -463,8 +463,8 @@ // RUN: %clang -target aarch64 -mcpu=GENERIC+nocrc+CRC -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-2 %s // RUN: %clang -target aarch64 -mcpu=cortex-a53+noSIMD -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-3 %s // CHECK-MCPU-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-aes"{{.*}} "-target-feature" "-sha2" -// CHECK-MCPU-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" -// CHECK-MCPU-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "-aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "-sha2" "-target-feature" "-neon" +// CHECK-MCPU-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+crc"{{.*}} "-target-feature" "+fp-armv8"{{.*}} "-target-feature" "+neon" +// CHECK-MCPU-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "-aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "-neon" "-target-feature" "+perfmon" "-target-feature" "-sha2" // RUN: %clang -target aarch64 -mcpu=cyclone+nocrc+nocrypto -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-MARCH %s // RUN: %clang -target aarch64 -march=armv8-a -mcpu=cyclone+nocrc+nocrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-MARCH %s @@ -492,9 +492,9 @@ // RUN: %clang -target aarch64 -march=ARMV8.1A+CRYPTO -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-FEATURE-1 %s // RUN: %clang -target aarch64 -march=Armv8.1a+NOcrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-FEATURE-2 %s // RUN: %clang -target aarch64 -march=armv8.1a+noSIMD -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-FEATURE-3 %s -// CHECK-V81A-FEATURE-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-V81A-FEATURE-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "+rdm" "-target-feature" "+neon" -// CHECK-V81A-FEATURE-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "-rdm" "-target-feature" "-neon" +// CHECK-V81A-FEATURE-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+rdm" "-target-feature" 
"+sha2" +// CHECK-V81A-FEATURE-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+rdm" +// CHECK-V81A-FEATURE-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "-neon" "-target-feature" "-rdm" // ================== Check Memory Tagging Extensions (MTE). // RUN: %clang -target arm64-none-linux-gnu -march=armv8.5-a+memtag -x c -E -dM %s -o - 2>&1 | FileCheck -check-prefix=CHECK-MEMTAG %s diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp index a90d3bc330538b..31b622c848d6a9 100644 --- a/clang/test/Preprocessor/embed_weird.cpp +++ b/clang/test/Preprocessor/embed_weird.cpp @@ -1,7 +1,9 @@ -// RUN: printf "\0" > %S/Inputs/null_byte.bin -// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify=expected,cxx -Wno-c23-extensions -// RUN: %clang_cc1 -x c -std=c23 %s -fsyntax-only --embed-dir=%S/Inputs -verify=expected,c -// RUN: rm %S/Inputs/null_byte.bin +// RUN: rm -rf %t && mkdir -p %t/media +// RUN: cp %S/Inputs/single_byte.txt %S/Inputs/jk.txt %S/Inputs/numbers.txt %t/ +// RUN: cp %S/Inputs/media/empty %t/media/ +// RUN: printf "\0" > %t/null_byte.bin +// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%t -verify=expected,cxx -Wno-c23-extensions +// RUN: %clang_cc1 -x c -std=c23 %s -fsyntax-only --embed-dir=%t -verify=expected,c #embed ; diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c index 14e2f584bd0938..7d29e45c7d5ac6 100644 --- a/clang/test/Preprocessor/predefined-win-macros.c +++ b/clang/test/Preprocessor/predefined-win-macros.c @@ -116,6 +116,7 @@ // CHECK-X86-MINGW: #define WINNT 1 // CHECK-X86-MINGW: #define _WIN32 1 // CHECK-X86-MINGW-NOT: #define _WIN64 1 +// CHECK-X86-MINGW: #define __GXX_TYPEINFO_EQUALITY_INLINE 0 // RUN: %clang_cc1 -triple thumbv7-windows-gnu %s -E -dM -o - \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-ARM-MINGW @@ -125,6 +126,7 @@ // CHECK-ARM-MINGW: #define WINNT 1 // CHECK-ARM-MINGW: #define _WIN32 1 // CHECK-ARM-MINGW-NOT: #define _WIN64 1 +// CHECK-ARM-MINGW: #define __GXX_TYPEINFO_EQUALITY_INLINE 0 // RUN: %clang_cc1 -triple x86_64-windows-gnu %s -E -dM -o - \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-AMD64-MINGW @@ -134,6 +136,7 @@ // CHECK-AMD64-MINGW: #define WINNT 1 // CHECK-AMD64-MINGW: #define _WIN32 1 // CHECK-AMD64-MINGW: #define _WIN64 1 +// CHECK-AMD64-MINGW: #define __GXX_TYPEINFO_EQUALITY_INLINE 0 // RUN: %clang_cc1 -triple aarch64-windows-gnu %s -E -dM -o - \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-ARM64-MINGW @@ -145,6 +148,7 @@ // CHECK-ARM64-MINGW: #define _WIN32 1 // CHECK-ARM64-MINGW: #define _WIN64 1 // CHECK-ARM64-MINGW: #define __GCC_ASM_FLAG_OUTPUTS__ 1 +// CHECK-ARM64-MINGW: #define __GXX_TYPEINFO_EQUALITY_INLINE 0 // CHECK-ARM64-MINGW: #define __aarch64__ 1 // RUN: %clang_cc1 -triple arm64ec-windows-gnu %s -E -dM -o - \ @@ -157,6 +161,7 @@ // CHECK-ARM64EC-MINGW: #define _WIN32 1 // CHECK-ARM64EC-MINGW: #define _WIN64 1 // CHECK-ARM64EC-MINGW: #define __GCC_ASM_FLAG_OUTPUTS__ 1 +// CHECK-ARM64EC-MINGW: #define __GXX_TYPEINFO_EQUALITY_INLINE 0 // CHECK-ARM64EC-MINGW-NOT: #define __aarch64__ 1 // CHECK-ARM64EC-MINGW: #define __amd64 1 // CHECK-ARM64EC-MINGW: #define __amd64__ 1 diff --git 
a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index d7935af532dfa1..4dccbfa3b8a1ae 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -1683,44 +1683,44 @@ // CHECK-ZICFISS-EXT: __riscv_zicfiss 4000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_ssnpm0p8 -E -dM %s \ +// RUN: -march=rv32i_ssnpm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SSNPM-EXT %s // RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_ssnpm0p8 -E -dM %s \ +// RUN: -march=rv64i_ssnpm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SSNPM-EXT %s -// CHECK-SSNPM-EXT: __riscv_ssnpm 8000{{$}} +// CHECK-SSNPM-EXT: __riscv_ssnpm 1000000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_smnpm0p8 -E -dM %s \ +// RUN: -march=rv32i_smnpm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMNPM-EXT %s // RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_smnpm0p8 -E -dM %s \ +// RUN: -march=rv64i_smnpm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMNPM-EXT %s -// CHECK-SMNPM-EXT: __riscv_smnpm 8000{{$}} +// CHECK-SMNPM-EXT: __riscv_smnpm 1000000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_smmpm0p8 -E -dM %s \ +// RUN: -march=rv32i_smmpm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMMPM-EXT %s // RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_smmpm0p8 -E -dM %s \ +// RUN: -march=rv64i_smmpm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMMPM-EXT %s -// CHECK-SMMPM-EXT: __riscv_smmpm 8000{{$}} +// CHECK-SMMPM-EXT: __riscv_smmpm 1000000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_sspm0p8 -E -dM %s \ +// RUN: -march=rv32i_sspm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SSPM-EXT %s // RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_sspm0p8 -E -dM %s \ +// RUN: -march=rv64i_sspm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SSPM-EXT %s -// CHECK-SSPM-EXT: __riscv_sspm 8000{{$}} +// CHECK-SSPM-EXT: __riscv_sspm 1000000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_supm0p8 -E -dM %s \ +// RUN: -march=rv32i_supm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SUPM-EXT %s // RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_supm0p8 -E -dM %s \ +// RUN: -march=rv64i_supm1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SUPM-EXT %s -// CHECK-SUPM-EXT: __riscv_supm 8000{{$}} +// CHECK-SUPM-EXT: __riscv_supm 1000000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ // RUN: -march=rv32i_ssqosid1p0 -E -dM %s \ diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 3e63e2c77fddf9..6e9c968273a466 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -754,7 +754,8 @@ // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ccmp -x c -E -dM -o - %s | FileCheck --check-prefix=CCMP %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s // RUN: 
%clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s -// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,APXF %s +// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=zu -x c -E -dM -o - %s | FileCheck --check-prefix=ZU %s +// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,CF,APXF %s // APXF: #define __APX_F__ 1 // CCMP: #define __CCMP__ 1 // CF: #define __CF__ 1 @@ -763,6 +764,7 @@ // NF: #define __NF__ 1 // PPX: #define __PPX__ 1 // PUSH2POP2: #define __PUSH2POP2__ 1 +// ZU: #define __ZU__ 1 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-inline-asm-use-gpr32 -x c -E -dM -o - %s | FileCheck --check-prefixes=NOUSEGPR32 %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=egpr -mapx-inline-asm-use-gpr32 -x c -E -dM -o - %s | FileCheck --check-prefixes=USEGPR32 %s diff --git a/clang/test/Sema/aarch64-sme2p1-intrinsics/acle_sme2p1_imm.cpp b/clang/test/Sema/aarch64-sme2p1-intrinsics/acle_sme2p1_imm.cpp new file mode 100644 index 00000000000000..06b1e8301ce49d --- /dev/null +++ b/clang/test/Sema/aarch64-sme2p1-intrinsics/acle_sme2p1_imm.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -target-feature +sme2p1 -target-feature +bf16 -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include + +void tests_readz_tile_to_vector_single(uint32_t slice) __arm_streaming __arm_inout("za") { + svreadz_hor_za8_s8(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 0]}} + svreadz_hor_za16_s16(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svreadz_hor_za32_s32(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svreadz_hor_za64_s64(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svreadz_hor_za128_s8(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svreadz_hor_za128_s16(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svreadz_hor_za128_s32(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svreadz_hor_za128_s64(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svreadz_hor_za128_bf16(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + return; +} + + diff --git a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp index 95baf9f77fb14e..fcdd0516ed5a9d 100644 --- a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp +++ b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp @@ -6,23 +6,23 @@ void test_bfloat(svbool_t pg, uint64_t u64, int64_t i64, const bfloat16_t *const_bf16_ptr, bfloat16_t *bf16_ptr, svbfloat16_t bf16, svbfloat16x2_t bf16x2, svbfloat16x3_t bf16x3, svbfloat16x4_t bf16x4) { - // expected-error@+1 {{'svcreate2_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svcreate2_bf16' needs target feature 
(sve,bf16)|(sme,bf16)}} svcreate2_bf16(bf16, bf16); - // expected-error@+1 {{'svcreate3_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svcreate3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svcreate3_bf16(bf16, bf16, bf16); - // expected-error@+1 {{'svcreate4_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svcreate4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svcreate4_bf16(bf16, bf16, bf16, bf16); - // expected-error@+1 {{'svget2_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svget2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svget2_bf16(bf16x2, 1); - // expected-error@+1 {{'svget3_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svget3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svget3_bf16(bf16x3, 1); - // expected-error@+1 {{'svget4_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svget4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svget4_bf16(bf16x4, 1); - // expected-error@+1 {{'svld1_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svld1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svld1_bf16(pg, const_bf16_ptr); - // expected-error@+1 {{'svld1_vnum_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svld1_vnum_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svld1_vnum_bf16(pg, const_bf16_ptr, i64); - // expected-error@+1 {{'svld1rq_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svld1rq_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svld1rq_bf16(pg, const_bf16_ptr); // expected-error@+1 {{'svldff1_bf16' needs target feature sve,bf16}} svldff1_bf16(pg, const_bf16_ptr); @@ -32,55 +32,55 @@ void test_bfloat(svbool_t pg, uint64_t u64, int64_t i64, const bfloat16_t *const svldnf1_bf16(pg, const_bf16_ptr); // expected-error@+1 {{'svldnf1_vnum_bf16' needs target feature sve,bf16}} svldnf1_vnum_bf16(pg, const_bf16_ptr, i64); - // expected-error@+1 {{'svldnt1_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svldnt1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svldnt1_bf16(pg, const_bf16_ptr); - // expected-error@+1 {{'svldnt1_vnum_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svldnt1_vnum_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svldnt1_vnum_bf16(pg, const_bf16_ptr, i64); - // expected-error@+1 {{'svrev_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svrev_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svrev_bf16(bf16); - // expected-error@+1 {{'svset2_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svset2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svset2_bf16(bf16x2, 1, bf16); - // expected-error@+1 {{'svset3_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svset3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svset3_bf16(bf16x3, 1, bf16); - // expected-error@+1 {{'svset4_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svset4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svset4_bf16(bf16x4, 1, bf16); - // expected-error@+1 {{'svst1_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svst1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svst1_bf16(pg, bf16_ptr, bf16); - // expected-error@+1 {{'svst1_vnum_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svst1_vnum_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svst1_vnum_bf16(pg, bf16_ptr, i64, bf16); - // 
expected-error@+1 {{'svstnt1_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svstnt1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svstnt1_bf16(pg, bf16_ptr, bf16); - // expected-error@+1 {{'svstnt1_vnum_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svstnt1_vnum_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svstnt1_vnum_bf16(pg, bf16_ptr, i64, bf16); - // expected-error@+1 {{'svtrn1_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svtrn1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svtrn1_bf16(bf16, bf16); // expected-error@+1 {{'svtrn1q_bf16' needs target feature sve,bf16}} svtrn1q_bf16(bf16, bf16); - // expected-error@+1 {{'svtrn2_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svtrn2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svtrn2_bf16(bf16, bf16); // expected-error@+1 {{'svtrn2q_bf16' needs target feature sve,bf16}} svtrn2q_bf16(bf16, bf16); - // expected-error@+1 {{'svundef_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svundef_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svundef_bf16(); - // expected-error@+1 {{'svundef2_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svundef2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svundef2_bf16(); - // expected-error@+1 {{'svundef3_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svundef3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svundef3_bf16(); - // expected-error@+1 {{'svundef4_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svundef4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svundef4_bf16(); - // expected-error@+1 {{'svuzp1_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svuzp1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svuzp1_bf16(bf16, bf16); // expected-error@+1 {{'svuzp1q_bf16' needs target feature sve,bf16}} svuzp1q_bf16(bf16, bf16); - // expected-error@+1 {{'svuzp2_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svuzp2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svuzp2_bf16(bf16, bf16); // expected-error@+1 {{'svuzp2q_bf16' needs target feature sve,bf16}} svuzp2q_bf16(bf16, bf16); - // expected-error@+1 {{'svzip1_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svzip1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svzip1_bf16(bf16, bf16); // expected-error@+1 {{'svzip1q_bf16' needs target feature sve,bf16}} svzip1q_bf16(bf16, bf16); - // expected-error@+1 {{'svzip2_bf16' needs target feature (sve|sme),bf16}} + // expected-error@+1 {{'svzip2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} svzip2_bf16(bf16, bf16); // expected-error@+1 {{'svzip2q_bf16' needs target feature sve,bf16}} svzip2q_bf16(bf16, bf16); diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp index bed86be606b104..bb72a3eaf60eca 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp @@ -14,16 +14,16 @@ void test_bfloat(const bfloat16_t *const_bf16_ptr, svbfloat16_t bf16, svbfloat16x2_t bf16x2) { - // expected-error@+2 {{'svwhilerw_bf16' needs target feature (sve2|sme),bf16}} - // overload-error@+1 {{'svwhilerw' needs target feature (sve2|sme),bf16}} + // expected-error@+2 {{'svwhilerw_bf16' needs target feature (sve2,bf16)|(sme,bf16)}} + // overload-error@+1 {{'svwhilerw' needs target 
feature (sve2,bf16)|(sme,bf16)}} SVE_ACLE_FUNC(svwhilerw,_bf16,,)(const_bf16_ptr, const_bf16_ptr); - // expected-error@+2 {{'svtbx_bf16' needs target feature (sve2|sme),bf16}} - // overload-error@+1 {{'svtbx' needs target feature (sve2|sme),bf16}} + // expected-error@+2 {{'svtbx_bf16' needs target feature (sve2,bf16)|(sme,bf16)}} + // overload-error@+1 {{'svtbx' needs target feature (sve2,bf16)|(sme,bf16)}} SVE_ACLE_FUNC(svtbx,_bf16,,)(bf16, bf16, svundef_u16()); - // expected-error@+2 {{'svtbl2_bf16' needs target feature (sve2|sme),bf16}} - // overload-error@+1 {{'svtbl2' needs target feature (sve2|sme),bf16}} + // expected-error@+2 {{'svtbl2_bf16' needs target feature (sve2,bf16)|(sme,bf16)}} + // overload-error@+1 {{'svtbl2' needs target feature (sve2,bf16)|(sme,bf16)}} SVE_ACLE_FUNC(svtbl2,_bf16,,)(bf16x2, svundef_u16()); - // expected-error@+2 {{'svwhilewr_bf16' needs target feature (sve2|sme),bf16}} - // overload-error@+1 {{'svwhilewr' needs target feature (sve2|sme),bf16}} + // expected-error@+2 {{'svwhilewr_bf16' needs target feature (sve2,bf16)|(sme,bf16)}} + // overload-error@+1 {{'svwhilewr' needs target feature (sve2,bf16)|(sme,bf16)}} SVE_ACLE_FUNC(svwhilewr,_bf16,,)(const_bf16_ptr, const_bf16_ptr); } diff --git a/clang/test/Sema/arm-vector-types-support.c b/clang/test/Sema/arm-vector-types-support.c index ed5f5ba175a94a..8b8c9634631d05 100644 --- a/clang/test/Sema/arm-vector-types-support.c +++ b/clang/test/Sema/arm-vector-types-support.c @@ -1,7 +1,8 @@ -// RUN: %clang_cc1 %s -triple armv7 -fsyntax-only -verify -// RUN: %clang_cc1 %s -triple aarch64 -fsyntax-only -verify -// RUN: %clang_cc1 %s -triple aarch64 -target-feature -fp-armv8 -target-abi aapcs-soft -fsyntax-only -verify +// RUN: %clang_cc1 %s -triple armv8.1m.main -fsyntax-only -verify +// RUN: %clang_cc1 %s -triple aarch64 -fsyntax-only -verify=sve-type +// RUN: %clang_cc1 %s -triple aarch64 -target-feature -fp-armv8 -target-abi aapcs-soft -fsyntax-only -verify=sve-type -typedef __attribute__((neon_vector_type(2))) int int32x2_t; // expected-error{{'neon_vector_type' attribute is not supported on targets missing 'neon', 'mve', 'sve' or 'sme'; specify an appropriate -march= or -mcpu=}} -typedef __attribute__((neon_polyvector_type(16))) short poly8x16_t; // expected-error{{'neon_polyvector_type' attribute is not supported on targets missing 'neon' or 'mve'; specify an appropriate -march= or -mcpu=}} +typedef __attribute__((neon_vector_type(2))) int int32x2_t; // expected-error{{on M-profile architectures 'neon_vector_type' attribute is not supported on targets missing 'mve'; specify an appropriate -march= or -mcpu=}} +typedef __attribute__((neon_polyvector_type(16))) unsigned char poly8x16_t; // expected-error{{on M-profile architectures 'neon_polyvector_type' attribute is not supported on targets missing 'mve'; specify an appropriate -march= or -mcpu=}} typedef __attribute__((arm_sve_vector_bits(256))) void nosveflag; // expected-error{{'arm_sve_vector_bits' attribute is not supported on targets missing 'sve'; specify an appropriate -march= or -mcpu=}} + // sve-type-error@-1{{'arm_sve_vector_bits' attribute is not supported on targets missing 'sve'; specify an appropriate -march= or -mcpu=}} diff --git a/clang/test/Sema/atomic-ops.c b/clang/test/Sema/atomic-ops.c index 2024b81ce6aec3..9b82d82ff82697 100644 --- a/clang/test/Sema/atomic-ops.c +++ b/clang/test/Sema/atomic-ops.c @@ -671,6 +671,36 @@ void zeroSizeArgError(struct Z *a, struct Z *b, struct Z *c) { } +struct IncompleteTy IncA, IncB, IncC; // expected-error 
3{{tentative definition has type 'struct IncompleteTy' that is never completed}} \ + // expected-note 27{{forward declaration of 'struct IncompleteTy'}} +void incompleteTypeArgError() { + __atomic_exchange(&IncB, &IncB, &IncC, memory_order_relaxed); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_exchange(&IncB, &IncB, &IncC, memory_order_acq_rel); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_exchange(&IncB, &IncB, &IncC, memory_order_acquire); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_exchange(&IncB, &IncB, &IncC, memory_order_consume); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_exchange(&IncB, &IncB, &IncC, memory_order_release); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_exchange(&IncB, &IncB, &IncC, memory_order_seq_cst); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_load(&IncA, &IncB, memory_order_relaxed); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_load(&IncA, &IncB, memory_order_acq_rel); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_load(&IncA, &IncB, memory_order_acquire); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_load(&IncA, &IncB, memory_order_consume); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_load(&IncA, &IncB, memory_order_release); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_load(&IncA, &IncB, memory_order_seq_cst); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_store(&IncA, &IncB, memory_order_relaxed); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_store(&IncA, &IncB, memory_order_acq_rel); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_store(&IncA, &IncB, memory_order_acquire); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_store(&IncA, &IncB, memory_order_consume); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_store(&IncA, &IncB, memory_order_release); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_store(&IncA, &IncB, memory_order_seq_cst); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_compare_exchange(&IncA, &IncB, &IncC, 0, memory_order_relaxed, memory_order_relaxed); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_compare_exchange(&IncA, &IncB, &IncC, 0, memory_order_acq_rel, memory_order_acq_rel); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_compare_exchange(&IncA, &IncB, &IncC, 0, memory_order_acquire, memory_order_acquire); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_compare_exchange(&IncA, &IncB, &IncC, 0, memory_order_consume, 
memory_order_consume); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_compare_exchange(&IncA, &IncB, &IncC, 0, memory_order_release, memory_order_release); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + __atomic_compare_exchange(&IncA, &IncB, &IncC, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{incomplete type 'struct IncompleteTy' where a complete type is required}} + +} + void nullPointerWarning(void) { volatile _Atomic(int) vai; _Atomic(int) ai; diff --git a/clang/test/Sema/attr-capabilities.c b/clang/test/Sema/attr-capabilities.c index 5138803bd5eb7a..91a43c79d5b917 100644 --- a/clang/test/Sema/attr-capabilities.c +++ b/clang/test/Sema/attr-capabilities.c @@ -54,14 +54,14 @@ void Func18(void) __attribute__((release_capability())) {} // expected-warning { void Func19(void) __attribute__((release_shared_capability())) {} // expected-warning {{'release_shared_capability' attribute without capability arguments can only be applied to non-static methods of a class}} void Func20(void) __attribute__((release_generic_capability())) {} // expected-warning {{'release_generic_capability' attribute without capability arguments can only be applied to non-static methods of a class}} -void Func21(void) __attribute__((try_acquire_capability(1))) {} // expected-warning {{'try_acquire_capability' attribute without capability arguments can only be applied to non-static methods of a class}} -void Func22(void) __attribute__((try_acquire_shared_capability(1))) {} // expected-warning {{'try_acquire_shared_capability' attribute without capability arguments can only be applied to non-static methods of a class}} +int Func21(void) __attribute__((try_acquire_capability(1))) {} // expected-warning {{'try_acquire_capability' attribute without capability arguments can only be applied to non-static methods of a class}} +int Func22(void) __attribute__((try_acquire_shared_capability(1))) {} // expected-warning {{'try_acquire_shared_capability' attribute without capability arguments can only be applied to non-static methods of a class}} -void Func23(void) __attribute__((try_acquire_capability(1, GUI))) {} -void Func24(void) __attribute__((try_acquire_shared_capability(1, GUI))) {} +int Func23(void) __attribute__((try_acquire_capability(1, GUI))) {} +int Func24(void) __attribute__((try_acquire_shared_capability(1, GUI))) {} -void Func25(void) __attribute__((try_acquire_capability())) {} // expected-error {{'try_acquire_capability' attribute takes at least 1 argument}} -void Func26(void) __attribute__((try_acquire_shared_capability())) {} // expected-error {{'try_acquire_shared_capability' attribute takes at least 1 argument}} +int Func25(void) __attribute__((try_acquire_capability())) {} // expected-error {{'try_acquire_capability' attribute takes at least 1 argument}} +int Func26(void) __attribute__((try_acquire_shared_capability())) {} // expected-error {{'try_acquire_shared_capability' attribute takes at least 1 argument}} // Test that boolean logic works with capability attributes void Func27(void) __attribute__((requires_capability(!GUI))); diff --git a/clang/test/Sema/attr-nonblocking-sema.c b/clang/test/Sema/attr-nonblocking-sema.c new file mode 100644 index 00000000000000..0647e47febef20 --- /dev/null +++ b/clang/test/Sema/attr-nonblocking-sema.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c89 %s + +// Tests for a few cases involving C functions without prototypes. 
+ +void noproto() __attribute__((nonblocking)) // expected-error {{'nonblocking' function must have a prototype}} +{ +} + +// This will succeed +void noproto(void) __attribute__((blocking)); + +// A redeclaration isn't any different - a prototype is required. +void f1(void); +void f1() __attribute__((nonblocking)); // expected-error {{'nonblocking' function must have a prototype}} diff --git a/clang/test/Sema/attr-nonblocking-sema.cpp b/clang/test/Sema/attr-nonblocking-sema.cpp new file mode 100644 index 00000000000000..38bf2ac8f8a4cc --- /dev/null +++ b/clang/test/Sema/attr-nonblocking-sema.cpp @@ -0,0 +1,183 @@ +// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -verify %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -x c -std=c23 %s + +#if !__has_attribute(nonblocking) +#error "the 'nonblocking' attribute is not available" +#endif + +// --- ATTRIBUTE SYNTAX: SUBJECTS --- + +int nl_var [[clang::nonblocking]]; // expected-warning {{'nonblocking' only applies to function types; type here is 'int'}} +struct nl_struct {} [[clang::nonblocking]]; // expected-warning {{attribute 'nonblocking' is ignored, place it after "struct" to apply attribute to type declaration}} +struct [[clang::nonblocking]] nl_struct2 {}; // expected-error {{'nonblocking' attribute cannot be applied to a declaration}} + +// Positive case +typedef void (*fo)() [[clang::nonblocking]]; +void (*read_me_and_weep( + int val, void (*func)(int) [[clang::nonblocking]]) + [[clang::nonblocking]]) (int) + [[clang::nonblocking]]; + +// --- ATTRIBUTE SYNTAX: ARGUMENT COUNT --- +void nargs_1() [[clang::nonblocking(1, 2)]]; // expected-error {{'nonblocking' attribute takes no more than 1 argument}} +void nargs_2() [[clang::nonallocating(1, 2)]]; // expected-error {{'nonallocating' attribute takes no more than 1 argument}} +void nargs_3() [[clang::blocking(1)]]; // expected-error {{'blocking' attribute takes no arguments}} +void nargs_4() [[clang::allocating(1)]]; // expected-error {{'allocating' attribute takes no arguments}} + +// --- ATTRIBUTE SYNTAX: COMBINATIONS --- +// Check invalid combinations of nonblocking/nonallocating attributes + +void nl_true_false_1() [[clang::nonblocking(true)]] [[clang::blocking]]; // expected-error {{'blocking' and 'nonblocking' attributes are not compatible}} +void nl_true_false_2() [[clang::blocking]] [[clang::nonblocking(true)]]; // expected-error {{'nonblocking' and 'blocking' attributes are not compatible}} + +void nl_true_false_3() [[clang::nonblocking, clang::blocking]]; // expected-error {{'blocking' and 'nonblocking' attributes are not compatible}} +void nl_true_false_4() [[clang::blocking, clang::nonblocking]]; // expected-error {{'nonblocking' and 'blocking' attributes are not compatible}} + +void na_true_false_1() [[clang::nonallocating(true)]] [[clang::allocating]]; // expected-error {{'allocating' and 'nonallocating' attributes are not compatible}} +void na_true_false_2() [[clang::allocating]] [[clang::nonallocating(true)]]; // expected-error {{'nonallocating' and 'allocating' attributes are not compatible}} + +void na_true_false_3() [[clang::nonallocating, clang::allocating]]; // expected-error {{'allocating' and 'nonallocating' attributes are not compatible}} +void na_true_false_4() [[clang::allocating, clang::nonallocating]]; // expected-error {{'nonallocating' and 'allocating' attributes are not compatible}} + +void nl_true_na_true_1() [[clang::nonblocking]] [[clang::nonallocating]]; +void nl_true_na_true_2() [[clang::nonallocating]] [[clang::nonblocking]]; + +void 
nl_true_na_false_1() [[clang::nonblocking]] [[clang::allocating]]; // expected-error {{'allocating' and 'nonblocking' attributes are not compatible}} +void nl_true_na_false_2() [[clang::allocating]] [[clang::nonblocking]]; // expected-error {{'nonblocking' and 'allocating' attributes are not compatible}} + +void nl_false_na_true_1() [[clang::blocking]] [[clang::nonallocating]]; +void nl_false_na_true_2() [[clang::nonallocating]] [[clang::blocking]]; + +void nl_false_na_false_1() [[clang::blocking]] [[clang::allocating]]; +void nl_false_na_false_2() [[clang::allocating]] [[clang::blocking]]; + +// --- TYPE CONVERSIONS --- + +void unannotated(); +void nonblocking() [[clang::nonblocking]]; +void nonallocating() [[clang::nonallocating]]; +void type_conversions() +{ + // It's fine to remove a performance constraint. + void (*fp_plain)(); + + fp_plain = nullptr; + fp_plain = unannotated; + fp_plain = nonblocking; + fp_plain = nonallocating; + + // Adding/spoofing nonblocking is unsafe. + void (*fp_nonblocking)() [[clang::nonblocking]]; + fp_nonblocking = nullptr; + fp_nonblocking = nonblocking; + fp_nonblocking = unannotated; // expected-warning {{attribute 'nonblocking' should not be added via type conversion}} + fp_nonblocking = nonallocating; // expected-warning {{attribute 'nonblocking' should not be added via type conversion}} + + // Adding/spoofing nonallocating is unsafe. + void (*fp_nonallocating)() [[clang::nonallocating]]; + fp_nonallocating = nullptr; + fp_nonallocating = nonallocating; + fp_nonallocating = nonblocking; // no warning because nonblocking includes nonallocating fp_nonallocating = unannotated; + fp_nonallocating = unannotated; // expected-warning {{attribute 'nonallocating' should not be added via type conversion}} +} + +#ifdef __cplusplus +struct PTMF { + void unannotated(); + void nonblocking() [[clang::nonblocking]]; + void nonallocating() [[clang::nonallocating]]; +}; + +void type_conversions_ptmf() +{ + // It's fine to remove a performance constraint. + void (PTMF::*ptmf_plain)() = nullptr; + + ptmf_plain = &PTMF::unannotated; + ptmf_plain = &PTMF::nonblocking; + ptmf_plain = &PTMF::nonallocating; + + // Adding/spoofing nonblocking is unsafe. + void (PTMF::*fp_nonblocking)() [[clang::nonblocking]] = nullptr; + fp_nonblocking = &PTMF::nonblocking; + fp_nonblocking = &PTMF::unannotated; // expected-warning {{attribute 'nonblocking' should not be added via type conversion}} + fp_nonblocking = &PTMF::nonallocating; // expected-warning {{attribute 'nonblocking' should not be added via type conversion}} + + // Adding/spoofing nonallocating is unsafe. + void (PTMF::*fp_nonallocating)() [[clang::nonallocating]] = nullptr; + fp_nonallocating = &PTMF::nonallocating; + fp_nonallocating = &PTMF::nonblocking; // no warning because nonblocking includes nonallocating fp_nonallocating = unannotated; + fp_nonallocating = &PTMF::unannotated; // expected-warning {{attribute 'nonallocating' should not be added via type conversion}} +} + +// There was a bug: noexcept and nonblocking could be individually removed in conversion, but not both +void type_conversions_2() +{ + auto receives_fp = [](void (*fp)()) { + }; + + auto ne = +[]() noexcept {}; + auto nl = +[]() [[clang::nonblocking]] {}; + auto nl_ne = +[]() noexcept [[clang::nonblocking]] {}; + + receives_fp(ne); + receives_fp(nl); + receives_fp(nl_ne); +} +#endif + +// --- VIRTUAL METHODS --- +// Attributes propagate to overridden methods, so no diagnostics except for conflicts. +// Check this in the syntax tests too. 
+#ifdef __cplusplus +struct Base { + virtual void f1(); + virtual void nonblocking() noexcept [[clang::nonblocking]]; + virtual void nonallocating() noexcept [[clang::nonallocating]]; + virtual void f2() [[clang::nonallocating]]; // expected-note {{previous declaration is here}} +}; + +struct Derived : public Base { + void f1() [[clang::nonblocking]] override; + void nonblocking() noexcept override; + void nonallocating() noexcept override; + void f2() [[clang::allocating]] override; // expected-warning {{effects conflict when merging declarations; kept 'allocating', discarded 'nonallocating'}} +}; +#endif // __cplusplus + +// --- REDECLARATIONS --- + +void f2(); +void f2() [[clang::nonblocking]]; // expected-note {{previous declaration is here}} +void f2(); // expected-warning {{attribute 'nonblocking' on function does not match previous declaration}} +// Note: we verify that the attribute is actually seen during the constraints tests. + +void f3() [[clang::blocking]]; // expected-note {{previous declaration is here}} +void f3() [[clang::nonblocking]]; // expected-warning {{effects conflict when merging declarations; kept 'blocking', discarded 'nonblocking'}} + +// --- OVERLOADS --- +#ifdef __cplusplus +struct S { + void foo(); // expected-note {{previous declaration is here}} + void foo() [[clang::nonblocking]]; // expected-error {{class member cannot be redeclared}} +}; +#endif // __cplusplus + +// --- COMPUTED NONBLOCKING --- +void f4() [[clang::nonblocking(__builtin_memset)]] {} // expected-error {{nonblocking attribute requires an integer constant}} + +#ifdef __cplusplus +// Unexpanded parameter pack +template +void f5() [[clang::nonblocking(val /* NO ... here */)]] {} // expected-error {{expression contains unexpanded parameter pack 'val'}} + +void f6() { f5(); } + +template +void ambiguous() [[clang::nonblocking(B)]] [[clang::blocking]]; // expected-note {{candidate template ignored: substitution failure [with B = true]: 'blocking' and 'nonblocking' attributes are not compatible}} + +void f7() { + ambiguous(); // expected-error {{no matching function for call to 'ambiguous'}} + ambiguous(); +} +#endif // __cplusplus diff --git a/clang/test/Sema/attr-nonblocking-syntax.cpp b/clang/test/Sema/attr-nonblocking-syntax.cpp new file mode 100644 index 00000000000000..644ed754b04daa --- /dev/null +++ b/clang/test/Sema/attr-nonblocking-syntax.cpp @@ -0,0 +1,216 @@ +// RUN: %clang_cc1 %s -ast-dump -fblocks | FileCheck %s + +// Make sure that the attribute gets parsed and attached to the correct AST elements. + +#pragma clang diagnostic ignored "-Wunused-variable" + +// ========================================================================================= +// Square brackets, true + +namespace square_brackets { + +// 1. On the type of the FunctionDecl +void nl_function() [[clang::nonblocking]]; +// CHECK: FunctionDecl {{.*}} nl_function 'void () __attribute__((nonblocking))' + +// 2. On the type of the VarDecl holding a function pointer +void (*nl_func_a)() [[clang::nonblocking]]; +// CHECK: VarDecl {{.*}} nl_func_a 'void (*)() __attribute__((nonblocking))' + +// 3. On the type of the ParmVarDecl of a function parameter +static void nlReceiver(void (*nl_func)() [[clang::nonblocking]]); +// CHECK: ParmVarDecl {{.*}} nl_func 'void (*)() __attribute__((nonblocking))' + +// 4. 
As an AttributedType within the nested types of a typedef +typedef void (*nl_fp_type)() [[clang::nonblocking]]; +// CHECK: TypedefDecl {{.*}} nl_fp_type 'void (*)() __attribute__((nonblocking))' +using nl_fp_talias = void (*)() [[clang::nonblocking]]; +// CHECK: TypeAliasDecl {{.*}} nl_fp_talias 'void (*)() __attribute__((nonblocking))' + +// 5. From a typedef or typealias, on a VarDecl +nl_fp_type nl_fp_var1; +// CHECK: VarDecl {{.*}} nl_fp_var1 'nl_fp_type':'void (*)() __attribute__((nonblocking))' +nl_fp_talias nl_fp_var2; +// CHECK: VarDecl {{.*}} nl_fp_var2 'nl_fp_talias':'void (*)() __attribute__((nonblocking))' + +// 6. On type of a FieldDecl +struct Struct { + void (*nl_func_field)() [[clang::nonblocking]]; +// CHECK: FieldDecl {{.*}} nl_func_field 'void (*)() __attribute__((nonblocking))' +}; + +// nonallocating should NOT be subsumed into nonblocking +void nl1() [[clang::nonblocking]] [[clang::nonallocating]]; +// CHECK: FunctionDecl {{.*}} nl1 'void () __attribute__((nonblocking)) __attribute__((nonallocating))' + +void nl2() [[clang::nonallocating]] [[clang::nonblocking]]; +// CHECK: FunctionDecl {{.*}} nl2 'void () __attribute__((nonblocking)) __attribute__((nonallocating))' + +decltype(nl1) nl3; +// CHECK: FunctionDecl {{.*}} nl3 'decltype(nl1)':'void () __attribute__((nonblocking)) __attribute__((nonallocating))' + +// Attribute propagates from base class virtual method to overrides. +struct Base { + virtual void nb_method() [[clang::nonblocking]]; +}; +struct Derived : public Base { + void nb_method() override; + // CHECK: CXXMethodDecl {{.*}} nb_method 'void () __attribute__((nonblocking))' +}; + +// Dependent expression +template +struct Dependent { + void nb_method2() [[clang::nonblocking(V)]]; + // CHECK: CXXMethodDecl {{.*}} nb_method2 'void () __attribute__((nonblocking(V)))' +}; + +// --- Blocks --- + +// On the type of the VarDecl holding a BlockDecl +void (^nl_block1)() [[clang::nonblocking]] = ^() [[clang::nonblocking]] {}; +// CHECK: VarDecl {{.*}} nl_block1 'void (^)() __attribute__((nonblocking))' + +int (^nl_block2)() [[clang::nonblocking]] = ^() [[clang::nonblocking]] { return 0; }; +// CHECK: VarDecl {{.*}} nl_block2 'int (^)() __attribute__((nonblocking))' + +// The operand of the CallExpr is an ImplicitCastExpr of a DeclRefExpr -> nl_block which hold the attribute +static void blockCaller() { nl_block1(); } +// CHECK: DeclRefExpr {{.*}} 'nl_block1' 'void (^)() __attribute__((nonblocking))' + +// --- Lambdas --- + +// On the operator() of a lambda's CXXMethodDecl +auto nl_lambda = []() [[clang::nonblocking]] {}; +// CHECK: CXXMethodDecl {{.*}} operator() 'void () const __attribute__((nonblocking))' inline + +// ========================================================================================= +// Square brackets, false + +void nl_func_false() [[clang::blocking]]; +// CHECK: FunctionDecl {{.*}} nl_func_false 'void () __attribute__((blocking))' + +auto nl_lambda_false = []() [[clang::blocking]] {}; +// CHECK: CXXMethodDecl {{.*}} operator() 'void () const __attribute__((blocking))' + +} // namespace square_brackets + +// ========================================================================================= +// GNU-style attribute, true + +namespace gnu_style { + +// 1. On the type of the FunctionDecl +void nl_function() __attribute__((nonblocking)); +// CHECK: FunctionDecl {{.*}} nl_function 'void () __attribute__((nonblocking))' + +// 1a. 
Alternate placement on the FunctionDecl +__attribute__((nonblocking)) void nl_function(); +// CHECK: FunctionDecl {{.*}} nl_function 'void () __attribute__((nonblocking))' + +// 2. On the type of the VarDecl holding a function pointer +void (*nl_func_a)() __attribute__((nonblocking)); +// CHECK: VarDecl {{.*}} nl_func_a 'void (*)() __attribute__((nonblocking))' + +// 2a. Alternate attribute placement on VarDecl +__attribute__((nonblocking)) void (*nl_func_b)(); +// CHECK: VarDecl {{.*}} nl_func_b 'void (*)() __attribute__((nonblocking))' + +// 3. On the type of the ParmVarDecl of a function parameter +static void nlReceiver(void (*nl_func)() __attribute__((nonblocking))); +// CHECK: ParmVarDecl {{.*}} nl_func 'void (*)() __attribute__((nonblocking))' + +// 4. As an AttributedType within the nested types of a typedef +// Note different placement from square brackets for the typealias. +typedef void (*nl_fp_type)() __attribute__((nonblocking)); +// CHECK: TypedefDecl {{.*}} nl_fp_type 'void (*)() __attribute__((nonblocking))' +using nl_fp_talias = __attribute__((nonblocking)) void (*)(); +// CHECK: TypeAliasDecl {{.*}} nl_fp_talias 'void (*)() __attribute__((nonblocking))' + +// 5. From a typedef or typealias, on a VarDecl +nl_fp_type nl_fp_var1; +// CHECK: VarDecl {{.*}} nl_fp_var1 'nl_fp_type':'void (*)() __attribute__((nonblocking))' +nl_fp_talias nl_fp_var2; +// CHECK: VarDecl {{.*}} nl_fp_var2 'nl_fp_talias':'void (*)() __attribute__((nonblocking))' + +// 6. On type of a FieldDecl +struct Struct { + void (*nl_func_field)() __attribute__((nonblocking)); +// CHECK: FieldDecl {{.*}} nl_func_field 'void (*)() __attribute__((nonblocking))' +}; + +} // namespace gnu_style + +// ========================================================================================= +// nonallocating and allocating - quick checks because the code paths are generally +// identical after parsing. 
+ +void na_function() [[clang::nonallocating]]; +// CHECK: FunctionDecl {{.*}} na_function 'void () __attribute__((nonallocating))' + +void na_true_function() [[clang::nonallocating(true)]]; +// CHECK: FunctionDecl {{.*}} na_true_function 'void () __attribute__((nonallocating))' + +void na_false_function() [[clang::nonallocating(false)]]; +// CHECK: FunctionDecl {{.*}} na_false_function 'void () __attribute__((allocating))' + +void alloc_function() [[clang::allocating]]; +// CHECK: FunctionDecl {{.*}} alloc_function 'void () __attribute__((allocating))' + + +// ========================================================================================= +// Non-blocking with an expression parameter + +void t0() [[clang::nonblocking(1 - 1)]]; +// CHECK: FunctionDecl {{.*}} t0 'void () __attribute__((blocking))' +void t1() [[clang::nonblocking(1 + 1)]]; +// CHECK: FunctionDecl {{.*}} t1 'void () __attribute__((nonblocking))' + +template +struct ValueDependent { + void nb_method() [[clang::nonblocking(V)]]; +}; + +void t3() [[clang::nonblocking]] +{ + ValueDependent x1; + x1.nb_method(); +// CHECK: ClassTemplateSpecializationDecl {{.*}} ValueDependent +// CHECK: TemplateArgument integral 'false' +// CHECK: CXXMethodDecl {{.*}} nb_method 'void () __attribute__((blocking))' + + ValueDependent x2; + x2.nb_method(); +// CHECK: ClassTemplateSpecializationDecl {{.*}} ValueDependent +// CHECK: TemplateArgument integral 'true' +// CHECK: CXXMethodDecl {{.*}} nb_method 'void () __attribute__((nonblocking))' +} + +template +struct TypeDependent { + void td_method() [[clang::nonblocking(X::is_nb)]]; +}; + +struct NBPolicyTrue { + static constexpr bool is_nb = true; +}; + +struct NBPolicyFalse { + static constexpr bool is_nb = false; +}; + +void t4() +{ + TypeDependent x1; + x1.td_method(); +// CHECK: ClassTemplateSpecializationDecl {{.*}} TypeDependent +// CHECK: TemplateArgument type 'NBPolicyFalse' +// CHECK: CXXMethodDecl {{.*}} td_method 'void () __attribute__((blocking))' + + TypeDependent x2; + x2.td_method(); +// CHECK: ClassTemplateSpecializationDecl {{.*}} TypeDependent +// CHECK: TemplateArgument type 'NBPolicyTrue' +// CHECK: CXXMethodDecl {{.*}} td_method 'void () __attribute__((nonblocking))' +} + diff --git a/clang/test/Sema/preserve-none-call-conv.c b/clang/test/Sema/preserve-none-call-conv.c index 4508095863de5c..678fa7d5317e55 100644 --- a/clang/test/Sema/preserve-none-call-conv.c +++ b/clang/test/Sema/preserve-none-call-conv.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -fsyntax-only -triple x86_64-unknown-unknown -verify +// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-unknown-unknown -verify typedef void typedef_fun_t(int); diff --git a/clang/test/Sema/ptrauth.c b/clang/test/Sema/ptrauth.c index cd069881793c5d..fc1ae954fa36be 100644 --- a/clang/test/Sema/ptrauth.c +++ b/clang/test/Sema/ptrauth.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s +// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s -fexperimental-new-constant-interpreter #if __has_feature(ptrauth_intrinsics) #warning Pointer authentication enabled! diff --git a/clang/test/SemaCUDA/neon-attrs.cu b/clang/test/SemaCUDA/neon-attrs.cu deleted file mode 100644 index 129056741ac9a4..00000000000000 --- a/clang/test/SemaCUDA/neon-attrs.cu +++ /dev/null @@ -1,22 +0,0 @@ -// CPU-side compilation on ARM with neon enabled (no errors expected). 
-// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -aux-triple nvptx64 -x cuda -fsyntax-only -verify=quiet %s - -// CPU-side compilation on ARM with neon disabled. -// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature -neon -aux-triple nvptx64 -x cuda -fsyntax-only -verify %s - -// GPU-side compilation on ARM (no errors expected). -// RUN: %clang_cc1 -triple nvptx64 -aux-triple arm64-linux-gnu -fcuda-is-device -x cuda -fsyntax-only -verify=quiet %s - -// Regular C++ compilation on ARM with neon enabled (no errors expected). -// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -x c++ -fsyntax-only -verify=quiet %s - -// Regular C++ compilation on ARM with neon disabled. -// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature -neon -x c++ -fsyntax-only -verify %s - -// quiet-no-diagnostics -typedef __attribute__((neon_vector_type(4))) float float32x4_t; -// expected-error@-1 {{'neon_vector_type' attribute is not supported on targets missing 'neon', 'mve', 'sve' or 'sme'}} -// expect -typedef unsigned char poly8_t; -typedef __attribute__((neon_polyvector_type(8))) poly8_t poly8x8_t; -// expected-error@-1 {{'neon_polyvector_type' attribute is not supported on targets missing 'neon' or 'mve'}} diff --git a/clang/test/SemaCXX/atomic-ops.cpp b/clang/test/SemaCXX/atomic-ops.cpp index 213161364f5852..30d829d1e97d79 100644 --- a/clang/test/SemaCXX/atomic-ops.cpp +++ b/clang/test/SemaCXX/atomic-ops.cpp @@ -7,3 +7,18 @@ void PR28623() { void helper(char); // expected-note{{target}} __atomic_store_n(helper, 0, 0); // expected-error{{reference to overloaded function could not be resolved}} } + +template +struct X { + char arr[1]; +}; + +extern X* p, *q; + +// They should be accepted. +void f() { + __atomic_exchange(p, p, q, __ATOMIC_RELAXED); + __atomic_load(p, p, __ATOMIC_RELAXED); + __atomic_store(p, p, __ATOMIC_RELAXED); + __atomic_compare_exchange(p, p, q, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED); +} diff --git a/clang/test/SemaCXX/coroutine-vla.cpp b/clang/test/SemaCXX/coroutine-vla.cpp index 176e35f346e2b4..996c89025e2ad1 100644 --- a/clang/test/SemaCXX/coroutine-vla.cpp +++ b/clang/test/SemaCXX/coroutine-vla.cpp @@ -16,6 +16,12 @@ struct promise void unhandled_exception(); }; +// Test that we won't report the error incorrectly. 
+void bar(int n) { + int array[n]; + return; +} + coroutine foo(int n) { int array[n]; // expected-error {{variable length arrays in a coroutine are not supported}} co_return; diff --git a/clang/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp b/clang/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp index 74aa7cb2abe836..41b4f6dc308d9d 100644 --- a/clang/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp +++ b/clang/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter // RUN: %clang_cc1 -std=c++11 -fsyntax-only -DUNION_TEST -verify %s #ifdef UNION_TEST diff --git a/clang/test/SemaCXX/lambda-call.cpp b/clang/test/SemaCXX/lambda-call.cpp new file mode 100644 index 00000000000000..2c5b8b9a4b176b --- /dev/null +++ b/clang/test/SemaCXX/lambda-call.cpp @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -std=c++23 -verify -fsyntax-only %s + +namespace GH96205 { + +void f() { + auto l = [](this auto& self, int) -> void { self("j"); }; // expected-error {{no matching function for call to object of type}} \ + // expected-note {{no known conversion from 'const char[2]' to 'int'}} + l(3); // expected-note {{requested here}} +} + +} diff --git a/clang/test/SemaCXX/ptrauth-incomplete-virtual-member-function-return-arg-type.cpp b/clang/test/SemaCXX/ptrauth-incomplete-virtual-member-function-return-arg-type.cpp new file mode 100644 index 00000000000000..41bbba0e832fda --- /dev/null +++ b/clang/test/SemaCXX/ptrauth-incomplete-virtual-member-function-return-arg-type.cpp @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fptrauth-calls %s + +struct Incomplete0; // expected-note 3 {{forward declaration of 'Incomplete0'}} + +template +struct Incomplete1; // expected-note {{template is declared here}} + +struct Complete0 { +}; + +template +struct Complete1 { +}; + +struct S { + virtual int foo(); + virtual Incomplete0 virtual0(); // expected-note 2 {{'Incomplete0' is incomplete}} + virtual void virtual1(Incomplete1); // expected-note {{'Incomplete1' is incomplete}} + virtual Complete0 virtual2(); + virtual Complete1 virtual3(); + Incomplete0 nonvirtual0(); + template + void m0() { + (void)&S::virtual0; // expected-error {{incomplete type 'Incomplete0'}} expected-note {{cannot take an address of a virtual}} + } +}; + +template +struct S2 { + virtual Incomplete0 virtual0() noexcept(T); // expected-note {{'Incomplete0' is incomplete}} + + void m0() { + (void)&S2::virtual0; + } + + void m1() { + (void)&S2::virtual0; // expected-error {{incomplete type 'Incomplete0'}} expected-note {{cannot take an address of a virtual}} + } +}; + +void test_incomplete_virtual_member_function_return_arg_type() { + (void)&S::virtual0; // expected-error {{incomplete type 'Incomplete0}} expected-note {{cannot take an address of a virtual member function}} + (void)&S::virtual1; // expected-error {{implicit instantiation of undefined template 'Incomplete1'}} expected-note {{cannot take an address of a virtual member function}} + (void)&S::virtual2; + (void)&S::virtual3; + (void)&S::nonvirtual0; + int s = sizeof(&S::virtual0); + S2().m1(); // expected-note {{in instantiation of}} +} + diff --git a/clang/test/SemaCXX/verbose-trap.cpp b/clang/test/SemaCXX/verbose-trap.cpp new file mode 100644 index 00000000000000..2503f9860d9c34 --- /dev/null +++ b/clang/test/SemaCXX/verbose-trap.cpp @@ -0,0 +1,47 @@ +// 
RUN: %clang_cc1 -std=c++11 -fsyntax-only -fcxx-exceptions -verify %s +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify %s + +#if !__has_builtin(__builtin_verbose_trap) +#error +#endif + +constexpr char const* constCat1 = "cat"; +char const* const constCat2 = "cat"; +char const constCat3[] = "cat"; + +constexpr char const* constMsg1 = "hello"; +char const* const constMsg2 = "hello"; +char const constMsg3[] = "hello"; + +template +void f(const char * arg) { + __builtin_verbose_trap("cat1", "Arbitrary string literals can be used!"); + __builtin_verbose_trap(" cat1 ", "Argument_must_not_be_null"); + __builtin_verbose_trap("cat" "egory1", "hello" "world"); + __builtin_verbose_trap(constCat1, constMsg1); + __builtin_verbose_trap(constCat2, constMsg2); + __builtin_verbose_trap("", ""); + __builtin_verbose_trap(); // expected-error {{too few arguments}} + __builtin_verbose_trap(""); // expected-error {{too few arguments}} + __builtin_verbose_trap("", "", ""); // expected-error {{too many arguments}} + __builtin_verbose_trap("", 0); // expected-error {{argument to __builtin_verbose_trap must be a pointer to a constant string}} + __builtin_verbose_trap(1, ""); // expected-error {{cannot initialize a parameter of type 'const char *' with an rvalue of type 'int'}} + __builtin_verbose_trap(arg, ""); // expected-error {{argument to __builtin_verbose_trap must be a pointer to a constant string}} + __builtin_verbose_trap("cat$1", "hel$lo"); // expected-error 2 {{argument to __builtin_verbose_trap must not contain $}} + __builtin_verbose_trap(category, reason); + __builtin_verbose_trap(u8"cat1", u8"hello"); +#if __cplusplus >= 202002L + // FIXME: Accept c++20 u8 string literals. + // expected-error@-3 {{cannot initialize a parameter of type 'const char *' with an lvalue of type 'const char8_t[5]'}} +#endif + __builtin_verbose_trap("", "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd"); +} + +template +void f2() { + __builtin_verbose_trap(category, 1); // expected-error {{cannot initialize a parameter of type 'const char *' with an rvalue of type 'int'}} +} + +void test() { + f(nullptr); +} diff --git a/clang/test/SemaCXX/vtable_pointer_authentication_attribute.cpp b/clang/test/SemaCXX/vtable_pointer_authentication_attribute.cpp new file mode 100644 index 00000000000000..3a3386196fbfa5 --- /dev/null +++ b/clang/test/SemaCXX/vtable_pointer_authentication_attribute.cpp @@ -0,0 +1,225 @@ +// RUN: %clang_cc1 -fsyntax-only -triple arm64-apple-ios -verify -fptrauth-calls -std=c++2a %s + +namespace basic { + +#define authenticated(a, b, c...) [[clang::ptrauth_vtable_pointer(a, b, c)]] + +// Basic sanity tests +#define TEST_AUTH(name, auth...) 
\ + struct [[clang::ptrauth_vtable_pointer(auth)]] name { \ + virtual ~name() {} \ + } + +TEST_AUTH(NoParams); +// expected-error@-1{{'ptrauth_vtable_pointer' attribute takes at least 3 arguments}} +TEST_AUTH(NoAuth, no_authentication, default_address_discrimination, default_extra_discrimination); +TEST_AUTH(InvalidKey, wat, default_address_discrimination, default_extra_discrimination); +// expected-error@-1{{invalid authentication key 'wat'}} +TEST_AUTH(InvalidAddressDiscrimination, no_authentication, wat, default_extra_discrimination); +// expected-error@-1{{invalid address discrimination mode 'wat'}} +TEST_AUTH(InvalidExtraDiscrimination, no_authentication, default_address_discrimination, wat); +// expected-error@-1{{invalid extra discrimination selection 'wat'}} +TEST_AUTH(InvalidNoCustomDiscrimination, no_authentication, default_address_discrimination, custom_discrimination); +// expected-error@-1{{missing custom discrimination}} +TEST_AUTH(InvalidCustomDiscrimination, no_authentication, default_address_discrimination, custom_discrimination, wat); +// expected-error@-1{{invalid custom discrimination}} +TEST_AUTH(Default, default_key, default_address_discrimination, default_extra_discrimination); +TEST_AUTH(InvalidDefaultExtra, default_key, default_address_discrimination, default_extra_discrimination, 1); +// expected-error@-1{{'ptrauth_vtable_pointer' attribute takes no more than 3 arguments}} +TEST_AUTH(ProcessDependentKey, process_dependent, default_address_discrimination, default_extra_discrimination); +TEST_AUTH(ProcessIndependentKey, process_independent, default_address_discrimination, default_extra_discrimination); +TEST_AUTH(DefaultAddressDiscrimination, process_independent, default_address_discrimination, default_extra_discrimination); +TEST_AUTH(NoAddressDiscrimination, process_independent, no_address_discrimination, default_extra_discrimination); +TEST_AUTH(AddressDiscrimination, process_independent, address_discrimination, default_extra_discrimination); +TEST_AUTH(DefaultExtraDiscrimination, process_independent, default_address_discrimination, default_extra_discrimination); +TEST_AUTH(NoExtraDiscrimination, process_independent, default_address_discrimination, no_extra_discrimination); +TEST_AUTH(TypeExtraDiscrimination, process_independent, default_address_discrimination, type_discrimination); +TEST_AUTH(InvalidCustomExtraDiscrimination, process_independent, default_address_discrimination, custom_discrimination); +// expected-error@-1{{missing custom discrimination}} +TEST_AUTH(ValidCustomExtraDiscrimination, process_independent, default_address_discrimination, custom_discrimination, 1); + +// Basic valid authentication configuration +#define generic_authenticated \ + authenticated(process_independent, address_discrimination, type_discrimination) + +struct generic_authenticated ForwardDecl; + +struct generic_authenticated generic_authenticated InvalidDuplicateAttribute { + // expected-error@-1{{multiple vtable pointer authentication policies on 'InvalidDuplicateAttribute'}} + virtual ~InvalidDuplicateAttribute(){}; +}; +struct generic_authenticated ValidPolymorphic { + virtual ~ValidPolymorphic(){}; +}; +struct generic_authenticated InvalidMonomorphic { // expected-error{{cannot set vtable pointer authentication on monomorphic type 'InvalidMonomorphic'}} +}; +struct ValidMonomorphic { +}; + +struct ValidSubclass : ValidPolymorphic {}; +struct generic_authenticated InvalidSubclass : ValidPolymorphic {}; // expected-error{{cannot set vtable pointer authentication on 
'InvalidSubclass' which is a subclass of polymorphic type 'ValidPolymorphic'}} + +// Awful template time +template +struct generic_authenticated ExplicitlyAuthedMonomorphicTemplateClass : T {}; +// expected-error@-1{{cannot set vtable pointer authentication on 'ExplicitlyAuthedMonomorphicTemplateClass' which is a subclass of polymorphic type 'ValidPolymorphic'}} +// expected-error@-2{{cannot set vtable pointer authentication on monomorphic type 'ExplicitlyAuthedMonomorphicTemplateClass'}} +template +struct generic_authenticated ExplicitlyAuthedPolymorphicTemplateClass : T { // expected-error{{cannot set vtable pointer authentication on 'ExplicitlyAuthedPolymorphicTemplateClass' which is a subclass of polymorphic type 'ValidPolymorphic'}} + virtual ~ExplicitlyAuthedPolymorphicTemplateClass(){}; +}; +template +struct UnauthedMonomorphicTemplateClass : T {}; +template +struct UnauthedPolymorphicTemplateClass : T { + virtual ~UnauthedPolymorphicTemplateClass(){}; +}; + +ExplicitlyAuthedMonomorphicTemplateClass test1; +// expected-note@-1{{in instantiation of template class 'basic::ExplicitlyAuthedMonomorphicTemplateClass' requested here}} +ExplicitlyAuthedMonomorphicTemplateClass test2; +// expected-note@-1{{in instantiation of template class 'basic::ExplicitlyAuthedMonomorphicTemplateClass' requested here}} +ExplicitlyAuthedPolymorphicTemplateClass test3; +// expected-note@-1{{in instantiation of template class 'basic::ExplicitlyAuthedPolymorphicTemplateClass' requested here}} +ExplicitlyAuthedPolymorphicTemplateClass test4; + +UnauthedMonomorphicTemplateClass test5; +UnauthedMonomorphicTemplateClass test6; +UnauthedPolymorphicTemplateClass test7; +UnauthedPolymorphicTemplateClass test8; + +// Just use a different policy from the generic macro to verify we won't complain +// about the insanity +struct authenticated(process_independent, no_address_discrimination, type_discrimination) SecondAuthenticatedPolymorphic { + virtual ~SecondAuthenticatedPolymorphic(){}; +}; +struct UnauthenticatedPolymorphic { + virtual ~UnauthenticatedPolymorphic(){}; +}; + +struct MultipleParents1 : ValidPolymorphic, SecondAuthenticatedPolymorphic, UnauthenticatedPolymorphic {}; +struct MultipleParents2 : UnauthenticatedPolymorphic, ValidPolymorphic, SecondAuthenticatedPolymorphic {}; +struct generic_authenticated InvalidMultipleParents : UnauthenticatedPolymorphic, ValidPolymorphic, SecondAuthenticatedPolymorphic {}; +// expected-error@-1{{cannot set vtable pointer authentication on 'InvalidMultipleParents' which is a subclass of polymorphic type 'UnauthenticatedPolymorphic'}} + +template +struct generic_authenticated ExplicitlyAuthedPolymorphicTemplateClassNoBase { + virtual ~ExplicitlyAuthedPolymorphicTemplateClassNoBase(); +}; + +ExplicitlyAuthedPolymorphicTemplateClassNoBase v; + +struct ValidSubclassOfTemplate : ExplicitlyAuthedPolymorphicTemplateClassNoBase { +}; + +struct generic_authenticated InvalidSubclassOfTemplate : ExplicitlyAuthedPolymorphicTemplateClassNoBase { + // expected-error@-1{{cannot set vtable pointer authentication on 'InvalidSubclassOfTemplate' which is a subclass of polymorphic type 'ExplicitlyAuthedPolymorphicTemplateClassNoBase'}} +}; + +template +struct generic_authenticated ExplicitlyAuthedMonomorphicTemplateClassNoBase { + // expected-error@-1{{cannot set vtable pointer authentication on monomorphic type 'ExplicitlyAuthedMonomorphicTemplateClassNoBase'}} + // expected-error@-2{{cannot set vtable pointer authentication on monomorphic type 
'ExplicitlyAuthedMonomorphicTemplateClassNoBase'}} +}; + +ExplicitlyAuthedMonomorphicTemplateClassNoBase X; +// expected-note@-1{{in instantiation of template class 'basic::ExplicitlyAuthedMonomorphicTemplateClassNoBase' requested here}} + +template +struct generic_authenticated ExplicitlyAuthedTemplateClassValidBase : ValidMonomorphic { + // expected-error@-1{{cannot set vtable pointer authentication on monomorphic type 'ExplicitlyAuthedTemplateClassValidBase'}} + // expected-error@-2{{cannot set vtable pointer authentication on monomorphic type 'ExplicitlyAuthedTemplateClassValidBase'}} +}; + +ExplicitlyAuthedTemplateClassValidBase Y; +// expected-note@-1{{in instantiation of template class 'basic::ExplicitlyAuthedTemplateClassValidBase' requested here}} + +template +struct generic_authenticated ExplicitlyAuthedTemplateClassInvalidBase : ValidPolymorphic { + // expected-error@-1{{cannot set vtable pointer authentication on 'ExplicitlyAuthedTemplateClassInvalidBase' which is a subclass of polymorphic type 'ValidPolymorphic'}} + // expected-error@-2{{cannot set vtable pointer authentication on 'ExplicitlyAuthedTemplateClassInvalidBase' which is a subclass of polymorphic type 'ValidPolymorphic'}} +}; + +ExplicitlyAuthedTemplateClassInvalidBase Z; +// expected-note@-1{{in instantiation of template class 'basic::ExplicitlyAuthedTemplateClassInvalidBase' requested here}} + +template +class generic_authenticated TestPolymorphicTemplateSpecialization; + +template <> +class TestPolymorphicTemplateSpecialization { + MissingDecl *zl; + // expected-error@-1 {{unknown type name 'MissingDecl'}} +public: + virtual ~TestPolymorphicTemplateSpecialization(); +}; +template +class generic_authenticated TestPolymorphicTemplateSpecialization +// expected-error@-1 {{cannot set vtable pointer authentication on monomorphic type 'TestPolymorphicTemplateSpecialization'}} +// expected-error@-2 {{cannot set vtable pointer authentication on monomorphic type 'TestPolymorphicTemplateSpecialization'}} +{ +}; + +TestPolymorphicTemplateSpecialization b; +TestPolymorphicTemplateSpecialization b2; +// expected-note@-1 {{in instantiation of template class 'basic::TestPolymorphicTemplateSpecialization' requested here}} + +template class generic_authenticated TestMonomorphic {}; +// expected-error@-1 {{cannot set vtable pointer authentication on monomorphic type 'TestMonomorphic'}} +// expected-error@-2 {{cannot set vtable pointer authentication on monomorphic type 'TestMonomorphic'}} + +template <> class generic_authenticated TestMonomorphic { +public: + virtual ~TestMonomorphic(); +}; + +struct TestMonomorphicSubclass : TestMonomorphic { +}; +template struct generic_authenticated TestMonomorphicSubclass2 : TestMonomorphic { + // expected-error@-1 {{cannot set vtable pointer authentication on 'TestMonomorphicSubclass2' which is a subclass of polymorphic type 'TestMonomorphic'}} + // expected-error@-2 {{cannot set vtable pointer authentication on monomorphic type 'TestMonomorphicSubclass2'}} + // expected-note@-3 {{in instantiation of template class 'basic::TestMonomorphic' requested here}} +}; + +TestMonomorphicSubclass tms_1; +TestMonomorphicSubclass2 tms2_1; +// expected-note@-1 {{in instantiation of template class 'basic::TestMonomorphicSubclass2' requested here}} +TestMonomorphicSubclass2 tms2_2; +// expected-note@-1 {{in instantiation of template class 'basic::TestMonomorphicSubclass2' requested here}} +// expected-note@-2 {{in instantiation of template class 'basic::TestMonomorphicSubclass2' requested here}} + +template 
+class generic_authenticated dependent_type { + // expected-error@-1 {{cannot set vtable pointer authentication on monomorphic type 'dependent_type'}} + static constexpr unsigned small_object_size = 1; + char _model[small_object_size]; +}; + +template +class generic_authenticated dependent_type2 : public T... { + // expected-error@-1 {{cannot set vtable pointer authentication on 'dependent_type2' which is a subclass of polymorphic type 'Foo'}} + static constexpr unsigned small_object_size = 1; + char _model[small_object_size]; +}; + +struct Foo { + virtual ~Foo(); +}; + +dependent_type2 thing; +// expected-note@-1 {{in instantiation of template class 'basic::dependent_type2' requested here}} + +template +class task; +template struct alignedthing { + char buffer[align]; +}; + +template +class generic_authenticated task { + // expected-error@-1 {{cannot set vtable pointer authentication on monomorphic type 'task'}} + static constexpr __SIZE_TYPE__ small_object_size = 256; + alignedthing _model; +}; + +} // namespace basic diff --git a/clang/test/SemaCXX/warn-explicit-specialization-storage-class.cpp b/clang/test/SemaCXX/warn-explicit-specialization-storage-class.cpp new file mode 100644 index 00000000000000..5d9ace292dc06c --- /dev/null +++ b/clang/test/SemaCXX/warn-explicit-specialization-storage-class.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple i686-pc-linux-gnu -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple i686-pc-linux-gnu -fsyntax-only -Wno-explicit-specialization-storage-class -verify=expnone %s + +// expnone-no-diagnostics + +struct A { + template + static constexpr int x = 0; + + template<> + static constexpr int x = 1; // expected-warning{{explicit specialization cannot have a storage class}} +}; + +template +static constexpr int x = 0; + +template<> +static constexpr int x = 1; // expected-warning{{explicit specialization cannot have a storage class}} diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp index 73cc946ca0ce1c..a6ddf028e1fadb 100644 --- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp +++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp @@ -1954,6 +1954,125 @@ struct TestTryLock { } // end namespace TrylockTest +// Regression test for trylock attributes with an enumerator success argument. +// Prior versions of the analysis silently failed to evaluate success arguments +// that were neither `CXXBoolLiteralExpr` nor `IntegerLiteral` and assumed the +// value was false. +namespace TrylockSuccessEnumFalseNegative { + +enum TrylockResult { Failure = 0, Success = 1 }; + +class LOCKABLE Mutex { +public: + TrylockResult TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(Success); + void Unlock() EXCLUSIVE_UNLOCK_FUNCTION(); +}; + +class TrylockTest { + Mutex mu_; + int a_ GUARDED_BY(mu_) = 0; + + void test_bool_expression() { + if (!mu_.TryLock()) { // expected-note {{mutex acquired here}} + a_++; // expected-warning {{writing variable 'a_' requires holding mutex 'mu_' exclusively}} + mu_.Unlock(); // expected-warning {{releasing mutex 'mu_' that was not held}} + } + } // expected-warning {{mutex 'mu_' is not held on every path through here}} +}; +} // namespace TrylockSuccessEnumFalseNegative + +// This test demonstrates that the analysis does not distinguish between +// different non-zero enumerators. 
+namespace TrylockFalseNegativeWhenEnumHasMultipleFailures { + +enum TrylockResult { Failure1 = 0, Failure2, Success }; + +class LOCKABLE Mutex { +public: + TrylockResult TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(Success); + void Unlock() EXCLUSIVE_UNLOCK_FUNCTION(); +}; + +class TrylockTest { + Mutex mu_; + int a_ GUARDED_BY(mu_) = 0; + + void test_eq_success() { + if (mu_.TryLock() == Success) { + a_++; + mu_.Unlock(); + } + } + + void test_bool_expression() { + // This branch checks whether the trylock function returned a non-zero + // value. This satisfies the analysis, but fails to account for `Failure2`. + // From the analysis's perspective, `Failure2` and `Success` are equivalent! + if (mu_.TryLock()) { + a_++; + mu_.Unlock(); + } + } +}; +} // namespace TrylockFalseNegativeWhenEnumHasMultipleFailures + + +// This test demonstrates that the analysis does not detect when all enumerators +// are positive, and thus incapable of representing a failure. +namespace TrylockSuccessEnumNoZeroFalseNegative { + +enum TrylockResult { Failure = 1, Success = 2 }; + +class LOCKABLE Mutex { +public: + TrylockResult TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(Success); + void Unlock() EXCLUSIVE_UNLOCK_FUNCTION(); +}; + +class TrylockTest { + Mutex mu_; + int a_ GUARDED_BY(mu_) = 0; + + void test_bool_expression() { + // This branch checks whether the trylock function returned a non-zero value. + // This satisfies the analysis, but is actually incorrect because `Failure` + // and `Success` are both non-zero. + if (mu_.TryLock()) { + a_++; + mu_.Unlock(); + } + } +}; +} // namespace TrylockSuccessEnumNoZeroFalseNegative + + +// Demonstrate a mutex with a trylock function that conditionally vends a +// pointer to guarded data. +namespace TrylockFunctionReturnPointer { + +class LOCKABLE OwningMutex { +public: + int *TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(true); + void Unlock() EXCLUSIVE_UNLOCK_FUNCTION(); + +private: + int guarded_ GUARDED_BY(this) = 0; +}; + +class TrylockTest { + OwningMutex mu_; + + void test_ptr_return() { + if (int *p = mu_.TryLock()) { + *p = 0; + mu_.Unlock(); + } + } +}; + +} // namespace TrylockFunctionReturnPointer + + + namespace TestTemplateAttributeInstantiation { class Foo1 { @@ -3657,8 +3776,8 @@ class Foo { int a GUARDED_BY(mu_); bool c; - int tryLockMutexI() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu_); - Mutex* tryLockMutexP() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu_); + int tryLockMutexI() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu_); + Mutex *tryLockMutexP() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu_); void unlock() UNLOCK_FUNCTION(mu_); void test1(); diff --git a/clang/test/SemaCXX/warn-thread-safety-parsing.cpp index 0c5b0cc85897be..4921a75b84e398 100644 --- a/clang/test/SemaCXX/warn-thread-safety-parsing.cpp +++ b/clang/test/SemaCXX/warn-thread-safety-parsing.cpp @@ -716,15 +716,17 @@ int slf_function_bad_7() SHARED_LOCK_FUNCTION(0); // \ #error "Should support exclusive_trylock_function attribute" #endif -// takes a mandatory boolean or integer argument specifying the retval -// plus an optional list of locks (vars/fields) +// The attribute takes a mandatory success-value argument (nullptr or a bool, int, +// or enum literal) plus an optional list of locks (vars/fields). The annotated +// function must return bool, an integer, or a pointer type.
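// Editor's illustrative sketch (not part of the patch): the helper macros used in
// these tests typically wrap the underlying GNU attribute spellings, so user code
// taking advantage of the relaxed success-argument and return-type rules might look
// roughly like the following. The class and member names here are hypothetical.
enum AcquireResult { NotAcquired = 0, Acquired = 1 };

class __attribute__((lockable)) UserMutex {
public:
  // An enum literal is now accepted as the success value.
  AcquireResult TryAcquire() __attribute__((exclusive_trylock_function(Acquired)));
  // A pointer return type is now accepted; a non-null result means the lock is held.
  int *TryAcquireAndGet() __attribute__((exclusive_trylock_function(true)));
  void Release() __attribute__((unlock_function));

private:
  int data_ __attribute__((guarded_by(this))) = 0;
};

void use_user_mutex(UserMutex &m) {
  if (m.TryAcquire() == Acquired) {
    m.Release();
  }
  if (int *p = m.TryAcquireAndGet()) {
    *p = 1;
    m.Release();
  }
}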
void etf_function() __attribute__((exclusive_trylock_function)); // \ // expected-error {{'exclusive_trylock_function' attribute takes at least 1 argument}} -void etf_function_args() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu2); +void etf_function_args() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu2); // \ + // expected-error {{'exclusive_trylock_function' attribute only applies to functions that return bool, integer, or a pointer type}} -void etf_function_arg() EXCLUSIVE_TRYLOCK_FUNCTION(1); // \ +int etf_function_arg() EXCLUSIVE_TRYLOCK_FUNCTION(1); // \ // expected-warning {{'exclusive_trylock_function' attribute without capability arguments can only be applied to non-static methods of a class}} int etf_testfn(int y) EXCLUSIVE_TRYLOCK_FUNCTION(1); // \ @@ -743,10 +745,23 @@ class EtfFoo { private: int test_field EXCLUSIVE_TRYLOCK_FUNCTION(1); // \ // expected-warning {{'exclusive_trylock_function' attribute only applies to functions}} - void test_method() EXCLUSIVE_TRYLOCK_FUNCTION(1); // \ + bool test_method() EXCLUSIVE_TRYLOCK_FUNCTION(1); // \ // expected-warning {{'exclusive_trylock_function' attribute without capability arguments refers to 'this', but 'EtfFoo' isn't annotated with 'capability' or 'scoped_lockable' attribute}} }; +// It does not make sense to annotate constructors/destructors as exclusive +// trylock functions because they cannot return a value. See +// . +class EtfConstructorDestructor { + EtfConstructorDestructor() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu); // \ + // expected-error {{'exclusive_trylock_function' attribute only applies to functions that return bool, integer, or a pointer type}} + + ~EtfConstructorDestructor() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu); // \ + // expected-error {{'exclusive_trylock_function' attribute only applies to functions that return bool, integer, or a pointer type}} + + Mutex mu; +}; + class EXCLUSIVE_TRYLOCK_FUNCTION(1) EtfTestClass { // \ // expected-warning {{'exclusive_trylock_function' attribute only applies to functions}} }; @@ -756,7 +771,68 @@ void etf_fun_params(int lvar EXCLUSIVE_TRYLOCK_FUNCTION(1)); // \ // Check argument parsing. -// legal attribute arguments +int global_int = 0; +int* etf_fun_with_global_ptr_success() EXCLUSIVE_TRYLOCK_FUNCTION(&global_int, &mu1); //\ + // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} + +#ifdef CPP11 +constexpr int kTrylockSuccess = 1; +int etf_succ_constexpr() EXCLUSIVE_TRYLOCK_FUNCTION(kTrylockSuccess, mu2); // \ + // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} + +int success_value_non_constant = 1; + +bool etf_succ_variable() EXCLUSIVE_TRYLOCK_FUNCTION(success_value_non_constant, mu2); //\ + // expected-error {{attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} +#endif + + +// Legal permutations of the first argument and function return type. 
+struct TrylockResult; + +#ifdef CPP11 +int* etf_fun_with_nullptr_success() EXCLUSIVE_TRYLOCK_FUNCTION(nullptr, &mu1); +#endif + +#ifndef __cplusplus +int* etf_fun_with_nullptr_success() EXCLUSIVE_TRYLOCK_FUNCTION(NULL, &mu1); +#endif + +int etf_succ_1_ret_i() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu2); +bool etf_succ_1_ret_b() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu2); +TrylockResult *etf_succ_1_ret_p() EXCLUSIVE_TRYLOCK_FUNCTION(1, mu2); + +int etf_succ_true_ret_i() EXCLUSIVE_TRYLOCK_FUNCTION(true, mu2); +bool etf_succ_true_ret_b() EXCLUSIVE_TRYLOCK_FUNCTION(true, mu2); +TrylockResult *etf_succ_true_ret_p() EXCLUSIVE_TRYLOCK_FUNCTION(true, mu2); + +int etf_succ_false_ret_i() EXCLUSIVE_TRYLOCK_FUNCTION(false, mu2); +bool etf_succ_false_ret_b() EXCLUSIVE_TRYLOCK_FUNCTION(false, mu2); +TrylockResult *etf_succ_false_ret_p() EXCLUSIVE_TRYLOCK_FUNCTION(false, mu2); + +enum TrylockSuccessEnum { TrylockNotAcquired = 0, TrylockAcquired }; +int etf_succ_enum_ret_i() EXCLUSIVE_TRYLOCK_FUNCTION(TrylockAcquired, mu2); +bool etf_succ_enum_ret_b() EXCLUSIVE_TRYLOCK_FUNCTION(TrylockAcquired, mu2); +TrylockResult *etf_succ_enum_ret_p() + EXCLUSIVE_TRYLOCK_FUNCTION(TrylockAcquired, mu2); + +#ifdef CPP11 +enum class TrylockSuccessEnumClass { NotAcquired = 0, Acquired}; +int etf_succ_enum_class_ret_i() + EXCLUSIVE_TRYLOCK_FUNCTION(TrylockSuccessEnumClass::Acquired, mu2); +bool etf_succ_enum_class_ret_b() + EXCLUSIVE_TRYLOCK_FUNCTION(TrylockSuccessEnumClass::Acquired, mu2); +TrylockResult *etf_succ_enum_class_ret_p() + EXCLUSIVE_TRYLOCK_FUNCTION(TrylockSuccessEnumClass::Acquired, mu2); +#endif + +// Demonstrate that we do not detect enum type mismatches between the success +// argument and the function's return type. +enum SomeOtherEnum { Foo = 1 }; +TrylockSuccessEnum etf_succ_enum_mismatch() + EXCLUSIVE_TRYLOCK_FUNCTION(Foo, mu2); + +// legal capability attribute arguments int etf_function_1() EXCLUSIVE_TRYLOCK_FUNCTION(1, muWrapper.mu); int etf_function_2() EXCLUSIVE_TRYLOCK_FUNCTION(1, muDoubleWrapper.muWrapper->mu); int etf_function_3() EXCLUSIVE_TRYLOCK_FUNCTION(1, muWrapper.getMu()); @@ -768,14 +844,13 @@ int etf_functetfn_8() EXCLUSIVE_TRYLOCK_FUNCTION(1, muPointer); int etf_function_9() EXCLUSIVE_TRYLOCK_FUNCTION(true); // \ // expected-warning {{'exclusive_trylock_function' attribute without capability arguments can only be applied to non-static methods of a class}} - // illegal attribute arguments int etf_function_bad_1() EXCLUSIVE_TRYLOCK_FUNCTION(mu1); // \ - // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be int or bool}} + // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} int etf_function_bad_2() EXCLUSIVE_TRYLOCK_FUNCTION("mu"); // \ - // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be int or bool}} + // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} int etf_function_bad_3() EXCLUSIVE_TRYLOCK_FUNCTION(muDoublePointer); // \ - // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be int or bool}} + // expected-error {{'exclusive_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} int etf_function_bad_4() EXCLUSIVE_TRYLOCK_FUNCTION(1, "mu"); // \ // expected-warning {{ignoring 'exclusive_trylock_function' attribute because its argument is invalid}} @@ -799,9 +874,9 @@ int etf_function_bad_6() 
EXCLUSIVE_TRYLOCK_FUNCTION(1, umu); // \ void stf_function() __attribute__((shared_trylock_function)); // \ // expected-error {{'shared_trylock_function' attribute takes at least 1 argument}} -void stf_function_args() SHARED_TRYLOCK_FUNCTION(1, mu2); +bool stf_function_args() SHARED_TRYLOCK_FUNCTION(1, mu2); -void stf_function_arg() SHARED_TRYLOCK_FUNCTION(1); // \ +bool stf_function_arg() SHARED_TRYLOCK_FUNCTION(1); // \ // expected-warning {{'shared_trylock_function' attribute without capability arguments can only be applied to non-static methods of a class}} int stf_testfn(int y) SHARED_TRYLOCK_FUNCTION(1); // \ @@ -824,7 +899,7 @@ class StfFoo { private: int test_field SHARED_TRYLOCK_FUNCTION(1); // \ // expected-warning {{'shared_trylock_function' attribute only applies to functions}} - void test_method() SHARED_TRYLOCK_FUNCTION(1); // \ + bool test_method() SHARED_TRYLOCK_FUNCTION(1); // \ // expected-warning {{'shared_trylock_function' attribute without capability arguments refers to 'this', but 'StfFoo' isn't annotated with 'capability' or 'scoped_lockable' attribute}} }; @@ -849,11 +924,11 @@ int stf_function_9() SHARED_TRYLOCK_FUNCTION(true); // \ // illegal attribute arguments int stf_function_bad_1() SHARED_TRYLOCK_FUNCTION(mu1); // \ - // expected-error {{'shared_trylock_function' attribute requires parameter 1 to be int or bool}} + // expected-error {{'shared_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} int stf_function_bad_2() SHARED_TRYLOCK_FUNCTION("mu"); // \ - // expected-error {{'shared_trylock_function' attribute requires parameter 1 to be int or bool}} + // expected-error {{'shared_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} int stf_function_bad_3() SHARED_TRYLOCK_FUNCTION(muDoublePointer); // \ - // expected-error {{'shared_trylock_function' attribute requires parameter 1 to be int or bool}} + // expected-error {{'shared_trylock_function' attribute requires parameter 1 to be nullptr or a bool, int, or enum literal}} int stf_function_bad_4() SHARED_TRYLOCK_FUNCTION(1, "mu"); // \ // expected-warning {{ignoring 'shared_trylock_function' attribute because its argument is invalid}} diff --git a/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl b/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl index bf8027d029882d..bc3cf8bc51daf4 100644 --- a/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl +++ b/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl @@ -23,8 +23,7 @@ void foo() { } struct ST2 { -// expected-error@+1 {{use of undeclared identifier 'SV_DispatchThreadID'}} +// expected-warning@+1 {{'SV_DispatchThreadID' attribute only applies to parameters and non-static data members}} static uint X : SV_DispatchThreadID; -// expected-error@+1 {{use of undeclared identifier 'SV_DispatchThreadID'}} uint s : SV_DispatchThreadID; }; diff --git a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl index 95774472aaea08..2f8aa098db701a 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl @@ -55,6 +55,6 @@ void bar(RWBuffer U : register(u3)) { struct S { // FIXME: generate better error when support semantic on struct field. // See https://github.com/llvm/llvm-project/issues/57889. 
- // expected-error@+1 {{expected expression}} + // expected-warning@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} RWBuffer U : register(u3); }; diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-store-error.cl new file mode 100644 index 00000000000000..356c031640483f --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-store-error.cl @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef char i8; +typedef short i16; +typedef int i32; +typedef int i64 __attribute__((ext_vector_type(2))); +typedef int i96 __attribute__((ext_vector_type(3))); +typedef int i128 __attribute__((ext_vector_type(4))); + +void test_amdgcn_raw_ptr_buffer_store_b8(i8 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_b8(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b8' must be a constant integer}} +} + +void test_amdgcn_raw_ptr_buffer_store_b16(i16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_b16(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b16' must be a constant integer}} +} + +void test_amdgcn_raw_ptr_buffer_store_b32(i32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_b32(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b32' must be a constant integer}} +} + +void test_amdgcn_raw_ptr_buffer_store_b64(i64 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_b64(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b64' must be a constant integer}} +} + +void test_amdgcn_raw_ptr_buffer_store_b96(i96 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_b96(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b96' must be a constant integer}} +} + +void test_amdgcn_raw_ptr_buffer_store_b128(i128 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_b128(vdata, rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_b128' must be a constant integer}} +} diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index cdfe8cfbd93799..9027076119cf94 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -224,9 +224,8 @@ Error executeCommands(StringRef ExecutablePath, ArrayRef Args) { if (!DryRun) if (sys::ExecuteAndWait(ExecutablePath, Args)) - return createStringError(inconvertibleErrorCode(), - "'" + sys::path::filename(ExecutablePath) + "'" + - " failed"); + return createStringError( + "'%s' failed", sys::path::filename(ExecutablePath).str().c_str()); return Error::success(); } @@ -259,7 +258,6 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) { 
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple())); if (Triple.isOSWindows()) return createStringError( - inconvertibleErrorCode(), "Relocatable linking is not supported on COFF targets"); Expected ObjcopyPath = @@ -272,8 +270,7 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) { auto BufferOrErr = DryRun ? MemoryBuffer::getMemBuffer("") : MemoryBuffer::getFileOrSTDIN(Output); if (!BufferOrErr) - return createStringError(inconvertibleErrorCode(), "Failed to open %s", - Output.str().c_str()); + return createStringError("Failed to open %s", Output.str().c_str()); std::string Suffix = "_" + getHash((*BufferOrErr)->getBuffer()); SmallVector ObjcopyArgs = { @@ -492,8 +489,7 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { file_magic Magic; if (auto EC = identify_magic(Arg->getValue(), Magic)) - return createStringError(inconvertibleErrorCode(), - "Failed to open %s", Arg->getValue()); + return createStringError("Failed to open %s", Arg->getValue()); if (Magic != file_magic::archive && Magic != file_magic::elf_shared_object) continue; @@ -568,9 +564,8 @@ Expected linkDevice(ArrayRef InputFiles, case Triple::systemz: return generic::clang(InputFiles, Args); default: - return createStringError(inconvertibleErrorCode(), - Triple.getArchName() + - " linking is not supported"); + return createStringError(Triple.getArchName() + + " linking is not supported"); } } @@ -881,15 +876,13 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, return Err; if (LTOError) - return createStringError(inconvertibleErrorCode(), - "Errors encountered inside the LTO pipeline."); + return createStringError("Errors encountered inside the LTO pipeline."); // If we are embedding bitcode we only need the intermediate output. bool SingleOutput = Files.size() == 1; if (Args.hasArg(OPT_embed_bitcode)) { if (BitcodeOutput.size() != 1 || !SingleOutput) - return createStringError(inconvertibleErrorCode(), - "Cannot embed bitcode with multiple files."); + return createStringError("Cannot embed bitcode with multiple files."); OutputFiles.push_back(Args.MakeArgString(BitcodeOutput.front())); return Error::success(); } @@ -936,7 +929,7 @@ Expected compileModule(Module &M, OffloadKind Kind) { std::string Msg; const Target *T = TargetRegistry::lookupTarget(M.getTargetTriple(), Msg); if (!T) - return createStringError(inconvertibleErrorCode(), Msg); + return createStringError(Msg); auto Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple(M.getTargetTriple())); @@ -966,8 +959,7 @@ Expected compileModule(Module &M, OffloadKind Kind) { CodeGenPasses.add(new TargetLibraryInfoWrapperPass(TLII)); if (TM->addPassesToEmitFile(CodeGenPasses, *OS, nullptr, CodeGenFileType::ObjectFile)) - return createStringError(inconvertibleErrorCode(), - "Failed to execute host backend"); + return createStringError("Failed to execute host backend"); CodeGenPasses.run(M); return *TempFileOrErr; @@ -1012,9 +1004,8 @@ wrapDeviceImages(ArrayRef> Buffers, return std::move(Err); break; default: - return createStringError(inconvertibleErrorCode(), - getOffloadKindName(Kind) + - " wrapping is not supported"); + return createStringError(getOffloadKindName(Kind) + + " wrapping is not supported"); } if (Args.hasArg(OPT_print_wrapped_module)) @@ -1109,9 +1100,8 @@ bundleLinkedOutput(ArrayRef Images, const ArgList &Args, case OFK_HIP: return bundleHIP(Images, Args); default: - return createStringError(inconvertibleErrorCode(), - getOffloadKindName(Kind) + - " bundling is not supported"); + return 
createStringError(getOffloadKindName(Kind) + + " bundling is not supported"); } } @@ -1209,7 +1199,7 @@ Expected> linkAndWrapDeviceFiles( StringSaver Saver(Alloc); auto BaseArgs = Tbl.parseArgs(Argc, Argv, OPT_INVALID, Saver, [](StringRef Err) { - reportError(createStringError(inconvertibleErrorCode(), Err)); + reportError(createStringError(Err)); }); auto LinkerArgs = getLinkerArgs(Input, BaseArgs); @@ -1510,9 +1500,8 @@ getDeviceInput(const ArgList &Args) { : std::string(Arg->getValue()); if (!Filename && Arg->getOption().matches(OPT_library)) - reportError(createStringError(inconvertibleErrorCode(), - "unable to find library -l%s", - Arg->getValue())); + reportError( + createStringError("unable to find library -l%s", Arg->getValue())); if (!Filename || !sys::fs::exists(*Filename) || sys::fs::is_directory(*Filename)) @@ -1646,7 +1635,7 @@ int main(int Argc, char **Argv) { BumpPtrAllocator Alloc; StringSaver Saver(Alloc); auto Args = Tbl.parseArgs(Argc, Argv, OPT_INVALID, Saver, [&](StringRef Err) { - reportError(createStringError(inconvertibleErrorCode(), Err)); + reportError(createStringError(Err)); }); if (Args.hasArg(OPT_help) || Args.hasArg(OPT_help_hidden)) { @@ -1691,9 +1680,9 @@ int main(int Argc, char **Argv) { if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) { unsigned Threads = 0; if (!llvm::to_integer(Arg->getValue(), Threads) || Threads == 0) - reportError(createStringError( - inconvertibleErrorCode(), "%s: expected a positive integer, got '%s'", - Arg->getSpelling().data(), Arg->getValue())); + reportError(createStringError("%s: expected a positive integer, got '%s'", + Arg->getSpelling().data(), + Arg->getValue())); parallel::strategy = hardware_concurrency(Threads); } diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index 2aebc6d3c01782..3c0599c2e51491 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -26,6 +26,7 @@ #include "clang/Frontend/Utils.h" #include "clang/FrontendTool/Utils.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Config/llvm-config.h" #include "llvm/LinkAllPasses.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -148,7 +149,7 @@ static int PrintSupportedExtensions(std::string TargetStr) { if (MachineTriple.isRISCV()) llvm::riscvExtensionsHelp(DescMap); else if (MachineTriple.isAArch64()) - llvm::AArch64::PrintSupportedExtensions(DescMap); + llvm::AArch64::PrintSupportedExtensions(); else if (MachineTriple.isARM()) llvm::ARM::PrintSupportedExtensions(DescMap); else { @@ -161,6 +162,45 @@ static int PrintSupportedExtensions(std::string TargetStr) { return 0; } +static int PrintEnabledExtensions(const TargetOptions& TargetOpts) { + std::string Error; + const llvm::Target *TheTarget = + llvm::TargetRegistry::lookupTarget(TargetOpts.Triple, Error); + if (!TheTarget) { + llvm::errs() << Error; + return 1; + } + + // Create a target machine using the input features, the triple information + // and a dummy instance of llvm::TargetOptions. Note that this is _not_ the + // same as the `clang::TargetOptions` instance we have access to here. 
+ llvm::TargetOptions BackendOptions; + std::string FeaturesStr = llvm::join(TargetOpts.FeaturesAsWritten, ","); + std::unique_ptr TheTargetMachine( + TheTarget->createTargetMachine(TargetOpts.Triple, TargetOpts.CPU, FeaturesStr, BackendOptions, std::nullopt)); + const llvm::Triple &MachineTriple = TheTargetMachine->getTargetTriple(); + const llvm::MCSubtargetInfo *MCInfo = TheTargetMachine->getMCSubtargetInfo(); + + // Extract the feature names that are enabled for the given target. + // We do that by capturing the key from the set of SubtargetFeatureKV entries + // provided by MCSubtargetInfo, which match the '-target-feature' values. + const std::vector Features = + MCInfo->getEnabledProcessorFeatures(); + std::set EnabledFeatureNames; + for (const llvm::SubtargetFeatureKV &feature : Features) + EnabledFeatureNames.insert(feature.Key); + + if (!MachineTriple.isAArch64()) { + // The option was already checked in Driver::HandleImmediateArgs, + // so we do not expect to get here if we are not a supported architecture. + assert(0 && "Unhandled triple for --print-enabled-extensions option."); + return 1; + } + llvm::AArch64::printEnabledExtensions(EnabledFeatureNames); + + return 0; +} + int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { ensureSufficientStack(); @@ -204,6 +244,10 @@ int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { if (Clang->getFrontendOpts().PrintSupportedExtensions) return PrintSupportedExtensions(Clang->getTargetOpts().Triple); + // --print-enabled-extensions takes priority over the actual compilation. + if (Clang->getFrontendOpts().PrintEnabledExtensions) + return PrintEnabledExtensions(Clang->getTargetOpts()); + // Infer the builtin include path if unspecified. if (Clang->getHeaderSearchOpts().UseBuiltinIncludes && Clang->getHeaderSearchOpts().ResourceDir.empty()) diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 92f9bae6cb064b..4c33546de6f923 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -7851,7 +7851,7 @@ TEST_P(ImportAttributes, ImportAcquireCapability) { TEST_P(ImportAttributes, ImportTryAcquireCapability) { TryAcquireCapabilityAttr *FromAttr, *ToAttr; importAttr( - "void test(int A1, int A2) __attribute__((try_acquire_capability(1, A1, " + "bool test(int A1, int A2) __attribute__((try_acquire_capability(1, A1, " "A2)));", FromAttr, ToAttr); checkImported(FromAttr->getSuccessValue(), ToAttr->getSuccessValue()); @@ -7948,7 +7948,7 @@ TEST_P(ImportAttributes, ImportAssertSharedLock) { TEST_P(ImportAttributes, ImportExclusiveTrylockFunction) { ExclusiveTrylockFunctionAttr *FromAttr, *ToAttr; importAttr( - "void test(int A1, int A2) __attribute__((exclusive_trylock_function(1, " + "bool test(int A1, int A2) __attribute__((exclusive_trylock_function(1, " "A1, A2)));", FromAttr, ToAttr); checkImported(FromAttr->getSuccessValue(), ToAttr->getSuccessValue()); @@ -7958,7 +7958,7 @@ TEST_P(ImportAttributes, ImportExclusiveTrylockFunction) { TEST_P(ImportAttributes, ImportSharedTrylockFunction) { SharedTrylockFunctionAttr *FromAttr, *ToAttr; importAttr( - "void test(int A1, int A2) __attribute__((shared_trylock_function(1, A1, " + "bool test(int A1, int A2) __attribute__((shared_trylock_function(1, A1, " "A2)));", FromAttr, ToAttr); checkImported(FromAttr->getSuccessValue(), ToAttr->getSuccessValue()); diff --git a/clang/unittests/AST/MatchVerifier.h b/clang/unittests/AST/MatchVerifier.h index da1e351da4a096..60bb4a8716ae8c 100644 --- 
a/clang/unittests/AST/MatchVerifier.h +++ b/clang/unittests/AST/MatchVerifier.h @@ -209,7 +209,7 @@ class LocationVerifier : public MatchVerifier { << ">, found <"; Loc.print(Msg, *Result.SourceManager); Msg << '>'; - this->setFailure(Msg.str()); + this->setFailure(MsgStr); } } @@ -256,7 +256,7 @@ class RangeVerifier : public MatchVerifier { Msg << '-'; End.print(Msg, *Result.SourceManager); Msg << '>'; - this->setFailure(Msg.str()); + this->setFailure(MsgStr); } } @@ -282,12 +282,12 @@ class DumpVerifier : public MatchVerifier { llvm::raw_string_ostream Dump(DumpStr); Node.dump(Dump, *Result.Context); - if (Dump.str().find(ExpectSubstring) == std::string::npos) { + if (DumpStr.find(ExpectSubstring) == std::string::npos) { std::string MsgStr; llvm::raw_string_ostream Msg(MsgStr); Msg << "Expected dump substring <" << ExpectSubstring << ">, found <" - << Dump.str() << '>'; - this->setFailure(Msg.str()); + << DumpStr << '>'; + this->setFailure(MsgStr); } } @@ -309,12 +309,12 @@ class PrintVerifier : public MatchVerifier { llvm::raw_string_ostream Print(PrintStr); Node.print(Print, Result.Context->getPrintingPolicy()); - if (Print.str() != ExpectString) { + if (PrintStr != ExpectString) { std::string MsgStr; llvm::raw_string_ostream Msg(MsgStr); Msg << "Expected pretty print <" << ExpectString << ">, found <" - << Print.str() << '>'; - this->setFailure(Msg.str()); + << PrintStr << '>'; + this->setFailure(MsgStr); } } diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index cfbc64c77b0cc4..e743eefa5d4585 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -3789,36 +3789,51 @@ TEST(TransferTest, AddrOfReference) { TEST(TransferTest, Preincrement) { std::string Code = R"( void target(int I) { + (void)0; // [[before]] int &IRef = ++I; - // [[p]] + // [[after]] } )"; runDataflow( Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + const Environment &EnvBefore = + getEnvironmentAtAnnotation(Results, "before"); + const Environment &EnvAfter = + getEnvironmentAtAnnotation(Results, "after"); - EXPECT_EQ(&getLocForDecl(ASTCtx, Env, "IRef"), - &getLocForDecl(ASTCtx, Env, "I")); + EXPECT_EQ(&getLocForDecl(ASTCtx, EnvAfter, "IRef"), + &getLocForDecl(ASTCtx, EnvBefore, "I")); + + const ValueDecl *IDecl = findValueDecl(ASTCtx, "I"); + EXPECT_NE(EnvBefore.getValue(*IDecl), nullptr); + EXPECT_EQ(EnvAfter.getValue(*IDecl), nullptr); }); } TEST(TransferTest, Postincrement) { std::string Code = R"( void target(int I) { + (void)0; // [[before]] int OldVal = I++; - // [[p]] + // [[after]] } )"; runDataflow( Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + const Environment &EnvBefore = + getEnvironmentAtAnnotation(Results, "before"); + const Environment &EnvAfter = + getEnvironmentAtAnnotation(Results, "after"); + + EXPECT_EQ(&getValueForDecl(ASTCtx, EnvBefore, "I"), + &getValueForDecl(ASTCtx, EnvAfter, "OldVal")); - EXPECT_EQ(&getValueForDecl(ASTCtx, Env, "OldVal"), - &getValueForDecl(ASTCtx, Env, "I")); + const ValueDecl *IDecl = findValueDecl(ASTCtx, "I"); + EXPECT_EQ(EnvAfter.getValue(*IDecl), nullptr); }); } diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index aded3ed2a6596e..2466677a3740d3 100644 --- 
a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -178,8 +178,9 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(IndentWrappedFunctionNames); CHECK_PARSE_BOOL(InsertBraces); CHECK_PARSE_BOOL(InsertNewlineAtEOF); - CHECK_PARSE_BOOL(KeepEmptyLinesAtEOF); - CHECK_PARSE_BOOL(KeepEmptyLinesAtTheStartOfBlocks); + CHECK_PARSE_BOOL_FIELD(KeepEmptyLines.AtEndOfFile, "KeepEmptyLinesAtEOF"); + CHECK_PARSE_BOOL_FIELD(KeepEmptyLines.AtStartOfBlock, + "KeepEmptyLinesAtTheStartOfBlocks"); CHECK_PARSE_BOOL(ObjCSpaceAfterProperty); CHECK_PARSE_BOOL(ObjCSpaceBeforeProtocolList); CHECK_PARSE_BOOL(Cpp11BracedListStyle); @@ -226,6 +227,9 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_NESTED_BOOL(BraceWrapping, SplitEmptyFunction); CHECK_PARSE_NESTED_BOOL(BraceWrapping, SplitEmptyRecord); CHECK_PARSE_NESTED_BOOL(BraceWrapping, SplitEmptyNamespace); + CHECK_PARSE_NESTED_BOOL(KeepEmptyLines, AtEndOfFile); + CHECK_PARSE_NESTED_BOOL(KeepEmptyLines, AtStartOfBlock); + CHECK_PARSE_NESTED_BOOL(KeepEmptyLines, AtStartOfFile); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterControlStatements); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterForeachMacros); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index db1decb20d626b..5276e79d759812 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -45,6 +45,10 @@ TEST_F(FormatTest, FormatsGlobalStatementsAt0) { verifyFormat("\nint i;", " \n\t \v \f int i;"); verifyFormat("int i;\nint j;", " int i; int j;"); verifyFormat("int i;\nint j;", " int i;\n int j;"); + + auto Style = getLLVMStyle(); + Style.KeepEmptyLines.AtStartOfFile = false; + verifyFormat("int i;", " \n\t \v \f int i;", Style); } TEST_F(FormatTest, FormatsUnwrappedLinesAtFirstFormat) { @@ -163,7 +167,7 @@ TEST_F(FormatTest, RemovesEmptyLines) { auto CustomStyle = getLLVMStyle(); CustomStyle.BreakBeforeBraces = FormatStyle::BS_Custom; CustomStyle.BraceWrapping.AfterNamespace = true; - CustomStyle.KeepEmptyLinesAtTheStartOfBlocks = false; + CustomStyle.KeepEmptyLines.AtStartOfBlock = false; verifyFormat("namespace N\n" "{\n" "\n" @@ -389,7 +393,7 @@ TEST_F(FormatTest, RemovesEmptyLines) { Style.BreakBeforeBraces = FormatStyle::BS_Custom; Style.BraceWrapping.AfterClass = true; Style.BraceWrapping.AfterFunction = true; - Style.KeepEmptyLinesAtTheStartOfBlocks = false; + Style.KeepEmptyLines.AtStartOfBlock = false; verifyFormat("class Foo\n" "{\n" @@ -21956,6 +21960,11 @@ TEST_F(FormatTest, HandlesUTF8BOM) { verifyFormat("\xef\xbb\xbf"); verifyFormat("\xef\xbb\xbf#include "); verifyFormat("\xef\xbb\xbf\n#include "); + + auto Style = getLLVMStyle(); + Style.KeepEmptyLines.AtStartOfFile = false; + verifyFormat("\xef\xbb\xbf#include ", + "\xef\xbb\xbf\n#include ", Style); } // FIXME: Encode Cyrillic and CJK characters below to appease MS compilers. 
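For readers of the two hunks above: a minimal sketch, not part of this patch, of how the regrouped empty-line options are set through the clang::format API, assuming only the FormatStyle fields exercised in these tests.

#include "clang/Format/Format.h"

// Migration sketch: the flat booleans become nested KeepEmptyLines fields.
static clang::format::FormatStyle migratedStyle() {
  clang::format::FormatStyle Style = clang::format::getLLVMStyle();
  Style.KeepEmptyLines.AtEndOfFile = true;     // previously KeepEmptyLinesAtEOF
  Style.KeepEmptyLines.AtStartOfBlock = false; // previously KeepEmptyLinesAtTheStartOfBlocks
  Style.KeepEmptyLines.AtStartOfFile = false;  // new: drop blank lines at the start of a file
  return Style;
}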
@@ -27230,7 +27239,7 @@ TEST_F(FormatTest, InsertNewlineAtEOF) { TEST_F(FormatTest, KeepEmptyLinesAtEOF) { FormatStyle Style = getLLVMStyle(); - Style.KeepEmptyLinesAtEOF = true; + Style.KeepEmptyLines.AtEndOfFile = true; const StringRef Code{"int i;\n\n"}; verifyNoChange(Code, Style); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index d3b310fe59527a..5d83d8a0c44295 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -567,6 +567,24 @@ TEST_F(TokenAnnotatorTest, UnderstandsGreaterAfterTemplateCloser) { EXPECT_TOKEN(Tokens[8], tok::greater, TT_BinaryOperator); } +TEST_F(TokenAnnotatorTest, UnderstandsTernaryInTemplate) { + // IsExpression = false + auto Tokens = annotate("foo();"); + ASSERT_EQ(Tokens.size(), 12u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::less, TT_TemplateOpener); + EXPECT_TOKEN(Tokens[3], tok::question, TT_ConditionalExpr); + EXPECT_TOKEN(Tokens[5], tok::colon, TT_ConditionalExpr); + EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser); + + // IsExpression = true + Tokens = annotate("return foo();"); + ASSERT_EQ(Tokens.size(), 13u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener); + EXPECT_TOKEN(Tokens[4], tok::question, TT_ConditionalExpr); + EXPECT_TOKEN(Tokens[6], tok::colon, TT_ConditionalExpr); + EXPECT_TOKEN(Tokens[8], tok::greater, TT_TemplateCloser); +} + TEST_F(TokenAnnotatorTest, UnderstandsNonTemplateAngleBrackets) { auto Tokens = annotate("return a < b && c > d;"); ASSERT_EQ(Tokens.size(), 10u) << Tokens; diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp index bbd854149d5f59..29c5ead60b81e0 100644 --- a/clang/unittests/Interpreter/InterpreterTest.cpp +++ b/clang/unittests/Interpreter/InterpreterTest.cpp @@ -101,7 +101,7 @@ TEST_F(InterpreterTest, Errors) { auto Interp = createInterpreter(ExtraArgs, DiagPrinter.get()); auto Err = Interp->Parse("intentional_error v1 = 42; ").takeError(); using ::testing::HasSubstr; - EXPECT_THAT(DiagnosticsOS.str(), + EXPECT_THAT(DiagnosticOutput, HasSubstr("error: unknown type name 'intentional_error'")); EXPECT_EQ("Parsing failed.", llvm::toString(std::move(Err))); @@ -186,7 +186,7 @@ static std::string MangleName(NamedDecl *ND) { std::string mangledName; llvm::raw_string_ostream RawStr(mangledName); MangleC->mangleName(ND, RawStr); - return RawStr.str(); + return mangledName; } TEST_F(InterpreterTest, FindMangledNameSymbol) { diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp index 03ab66235e43c6..1bccf899f3f196 100644 --- a/clang/unittests/Tooling/RangeSelectorTest.cpp +++ b/clang/unittests/Tooling/RangeSelectorTest.cpp @@ -651,6 +651,48 @@ TEST(RangeSelectorTest, CallArgsErrors) { Failed(withTypeErrorMessage("stmt"))); } +TEST(RangeSelectorTest, ConstructExprArgs) { + const StringRef Code = R"cc( + struct C { + C(int, int); + }; + C f() { + return C(1, 2); + } + )cc"; + const char *ID = "id"; + TestMatch Match = matchCode(Code, cxxTemporaryObjectExpr().bind(ID)); + EXPECT_THAT_EXPECTED(select(constructExprArgs(ID), Match), HasValue("1, 2")); +} + +TEST(RangeSelectorTest, ConstructExprBracedArgs) { + const StringRef Code = R"cc( + struct C { + C(int, int); + }; + C f() { + return {1, 2}; + } + )cc"; + const char *ID = "id"; + TestMatch Match = matchCode(Code, cxxConstructExpr().bind(ID)); + EXPECT_THAT_EXPECTED(select(constructExprArgs(ID), Match), HasValue("1, 2")); +} 
+ +TEST(RangeSelectorTest, ConstructExprNoArgs) { + const StringRef Code = R"cc( + struct C { + C(); + }; + C f() { + return C(); + } + )cc"; + const char *ID = "id"; + TestMatch Match = matchCode(Code, cxxTemporaryObjectExpr().bind(ID)); + EXPECT_THAT_EXPECTED(select(constructExprArgs(ID), Match), HasValue("")); +} + TEST(RangeSelectorTest, StatementsOp) { StringRef Code = R"cc( void g(); diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp index 274f275ea66a9d..cd4bf0eb7bd5a7 100644 --- a/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp +++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp @@ -22,7 +22,7 @@ class DeductionGuideVisitor std::string Storage; llvm::raw_string_ostream Stream(Storage); D->print(Stream); - Match(Stream.str(), D->getLocation()); + Match(Storage, D->getLocation()); return true; } diff --git a/clang/unittests/Tooling/ReplacementsYamlTest.cpp b/clang/unittests/Tooling/ReplacementsYamlTest.cpp index 3328d9bad55c71..9558d7270159f7 100644 --- a/clang/unittests/Tooling/ReplacementsYamlTest.cpp +++ b/clang/unittests/Tooling/ReplacementsYamlTest.cpp @@ -43,7 +43,7 @@ TEST(ReplacementsYamlTest, serializesReplacements) { " Length: 2\n" " ReplacementText: 'replacement #2'\n" "...\n", - YamlContentStream.str().c_str()); + YamlContent.c_str()); } TEST(ReplacementsYamlTest, serializesNewLines) { @@ -67,7 +67,7 @@ TEST(ReplacementsYamlTest, serializesNewLines) { " Length: 0\n" " ReplacementText: \"#include \\n\"\n" "...\n", - YamlContentStream.str().c_str()); + YamlContent.c_str()); } TEST(ReplacementsYamlTest, deserializesReplacements) { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 307334a65b1fe2..7bf6091e63d185 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -2578,6 +2578,32 @@ static void emitClangAttrIdentifierArgList(RecordKeeper &Records, raw_ostream &O OS << "#endif // CLANG_ATTR_IDENTIFIER_ARG_LIST\n\n"; } +// Emits the indexed-argument-is-identifier property for attributes. +static void emitClangAttrStrictIdentifierArgAtIndexList(RecordKeeper &Records, + raw_ostream &OS) { + OS << "#if defined(CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST)\n"; + std::vector Attrs = Records.getAllDerivedDefinitions("Attr"); + + for (const auto *Attr : Attrs) { + if (!Attr->getValueAsBit("StrictEnumParameters")) + continue; + // Determine whether the first argument is an identifier. + std::vector Args = Attr->getValueAsListOfDefs("Args"); + uint64_t enumAtIndex = 0; + for (size_t i = 0; i < Args.size(); i++) { + enumAtIndex |= ((uint64_t)isIdentifierArgument(Args[0])) << i; + } + if (!enumAtIndex) + continue; + + // All these spellings take an identifier argument. 
+ forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) { + OS << ".Case(\"" << S.name() << "\", " << enumAtIndex << "ull)\n"; + }); + } + OS << "#endif // CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST\n\n"; +} + static bool keywordThisIsaIdentifierInArgument(const Record *Arg) { return !Arg->getSuperClasses().empty() && llvm::StringSwitch( @@ -4959,6 +4985,7 @@ void EmitClangAttrParserStringSwitches(RecordKeeper &Records, raw_ostream &OS) { emitClangAttrTypeArgList(Records, OS); emitClangAttrLateParsedList(Records, OS); emitClangAttrLateParsedExperimentalList(Records, OS); + emitClangAttrStrictIdentifierArgAtIndexList(Records, OS); } void EmitClangAttrSubjectMatchRulesParserStringSwitches(RecordKeeper &Records, diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 56f1fdf9ef574f..626031d38cf003 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -2370,10 +2370,6 @@ void NeonEmitter::run(raw_ostream &OS) { "Please use -mfloat-abi=softfp or -mfloat-abi=hard\"\n"; OS << "#else\n\n"; - OS << "#if !defined(__ARM_NEON)\n"; - OS << "#error \"NEON support not enabled\"\n"; - OS << "#else\n\n"; - OS << "#include \n\n"; OS << "#include \n"; @@ -2450,7 +2446,6 @@ void NeonEmitter::run(raw_ostream &OS) { OS << "#undef __ai\n\n"; OS << "#endif /* if !defined(__ARM_NEON) */\n"; OS << "#endif /* ifndef __ARM_FP */\n"; - OS << "#endif /* __ARM_NEON_H */\n"; } /// run - Read the records in arm_fp16.td and output arm_fp16.h. arm_fp16.h diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index c41c812ab9707a..caedd5978a87c0 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -165,7 +165,7 @@ class Intrinsic { ClassKind Class; /// The architectural #ifdef guard. - std::string Guard; + std::string SVEGuard, SMEGuard; // The merge suffix such as _m, _x or _z. 
std::string MergeSuffix; @@ -184,7 +184,8 @@ class Intrinsic { Intrinsic(StringRef Name, StringRef Proto, uint64_t MergeTy, StringRef MergeSuffix, uint64_t MemoryElementTy, StringRef LLVMName, uint64_t Flags, ArrayRef ImmChecks, TypeSpec BT, - ClassKind Class, SVEEmitter &Emitter, StringRef Guard); + ClassKind Class, SVEEmitter &Emitter, StringRef SVEGuard, + StringRef SMEGuard); ~Intrinsic()=default; @@ -194,7 +195,27 @@ class Intrinsic { TypeSpec getBaseTypeSpec() const { return BaseTypeSpec; } SVEType getBaseType() const { return BaseType; } - StringRef getGuard() const { return Guard; } + StringRef getSVEGuard() const { return SVEGuard; } + StringRef getSMEGuard() const { return SMEGuard; } + void printGuard(raw_ostream &OS) const { + if (!SVEGuard.empty() && SMEGuard.empty()) + OS << SVEGuard; + else if (SVEGuard.empty() && !SMEGuard.empty()) + OS << SMEGuard; + else { + if (SVEGuard.find(",") != std::string::npos || + SVEGuard.find("|") != std::string::npos) + OS << "(" << SVEGuard << ")"; + else + OS << SVEGuard; + OS << "|"; + if (SMEGuard.find(",") != std::string::npos || + SMEGuard.find("|") != std::string::npos) + OS << "(" << SMEGuard << ")"; + else + OS << SMEGuard; + } + } ClassKind getClassKind() const { return Class; } SVEType getReturnType() const { return Types[0]; } @@ -943,11 +964,12 @@ Intrinsic::Intrinsic(StringRef Name, StringRef Proto, uint64_t MergeTy, StringRef MergeSuffix, uint64_t MemoryElementTy, StringRef LLVMName, uint64_t Flags, ArrayRef Checks, TypeSpec BT, ClassKind Class, - SVEEmitter &Emitter, StringRef Guard) + SVEEmitter &Emitter, StringRef SVEGuard, + StringRef SMEGuard) : Name(Name.str()), LLVMName(LLVMName), Proto(Proto.str()), - BaseTypeSpec(BT), Class(Class), Guard(Guard.str()), - MergeSuffix(MergeSuffix.str()), BaseType(BT, 'd'), Flags(Flags), - ImmChecks(Checks.begin(), Checks.end()) { + BaseTypeSpec(BT), Class(Class), SVEGuard(SVEGuard.str()), + SMEGuard(SMEGuard.str()), MergeSuffix(MergeSuffix.str()), + BaseType(BT, 'd'), Flags(Flags), ImmChecks(Checks.begin(), Checks.end()) { // Types[0] is the return value. for (unsigned I = 0; I < (getNumParams() + 1); ++I) { char Mod; @@ -1147,7 +1169,8 @@ void SVEEmitter::createIntrinsic( StringRef Name = R->getValueAsString("Name"); StringRef Proto = R->getValueAsString("Prototype"); StringRef Types = R->getValueAsString("Types"); - StringRef Guard = R->getValueAsString("TargetGuard"); + StringRef SVEGuard = R->getValueAsString("SVETargetGuard"); + StringRef SMEGuard = R->getValueAsString("SMETargetGuard"); StringRef LLVMName = R->getValueAsString("LLVMIntrinsic"); uint64_t Merge = R->getValueAsInt("Merge"); StringRef MergeSuffix = R->getValueAsString("MergeSuffix"); @@ -1203,13 +1226,13 @@ void SVEEmitter::createIntrinsic( Out.push_back(std::make_unique( Name, Proto, Merge, MergeSuffix, MemEltType, LLVMName, Flags, ImmChecks, - TS, ClassS, *this, Guard)); + TS, ClassS, *this, SVEGuard, SMEGuard)); // Also generate the short-form (e.g. svadd_m) for the given type-spec. 
if (Intrinsic::isOverloadedIntrinsic(Name)) Out.push_back(std::make_unique( Name, Proto, Merge, MergeSuffix, MemEltType, LLVMName, Flags, - ImmChecks, TS, ClassG, *this, Guard)); + ImmChecks, TS, ClassG, *this, SVEGuard, SMEGuard)); } } @@ -1229,9 +1252,9 @@ void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS, [](const std::unique_ptr &A, const std::unique_ptr &B) { auto ToTuple = [](const std::unique_ptr &I) { - return std::make_tuple(I->getGuard(), - (unsigned)I->getClassKind(), - I->getName()); + return std::make_tuple( + I->getSVEGuard().str() + I->getSMEGuard().str(), + (unsigned)I->getClassKind(), I->getName()); }; return ToTuple(A) < ToTuple(B); }); @@ -1434,10 +1457,12 @@ void SVEEmitter::createBuiltins(raw_ostream &OS) { for (auto &Def : Defs) { // Only create BUILTINs for non-overloaded intrinsics, as overloaded // declarations only live in the header file. - if (Def->getClassKind() != ClassG) + if (Def->getClassKind() != ClassG) { OS << "TARGET_BUILTIN(__builtin_sve_" << Def->getMangledName() << ", \"" - << Def->getBuiltinTypeStr() << "\", \"n\", \"" << Def->getGuard() - << "\")\n"; + << Def->getBuiltinTypeStr() << "\", \"n\", \""; + Def->printGuard(OS); + OS << "\")\n"; + } } // Add reinterpret functions. @@ -1638,10 +1663,12 @@ void SVEEmitter::createSMEBuiltins(raw_ostream &OS) { for (auto &Def : Defs) { // Only create BUILTINs for non-overloaded intrinsics, as overloaded // declarations only live in the header file. - if (Def->getClassKind() != ClassG) + if (Def->getClassKind() != ClassG) { OS << "TARGET_BUILTIN(__builtin_sme_" << Def->getMangledName() << ", \"" - << Def->getBuiltinTypeStr() << "\", \"n\", \"" << Def->getGuard() - << "\")\n"; + << Def->getBuiltinTypeStr() << "\", \"n\", \""; + Def->printGuard(OS); + OS << "\")\n"; + } } OS << "#endif\n\n"; @@ -1783,9 +1810,9 @@ void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { getEnumValueForFlag("IsStreamingCompatible"); for (auto &Def : Defs) { - if (!Def->isFlagSet(VerifyRuntimeMode) && Def->getGuard().contains("sve") && - Def->getGuard().contains("sme")) - llvm_unreachable("Missing VerifyRuntimeMode flag"); + if (!Def->isFlagSet(VerifyRuntimeMode) && !Def->getSVEGuard().empty() && + !Def->getSMEGuard().empty()) + report_fatal_error("Missing VerifyRuntimeMode flag"); if (Def->isFlagSet(IsStreamingFlag)) StreamingMap["ArmStreaming"].insert(Def->getMangledName()); diff --git a/clang/www/c_status.html b/clang/www/c_status.html index d71d4c216f7447..84cd8e836006c9 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -57,12 +57,12 @@

C Support in Clang

C11 -std=c11 - Probably + Partial C17 -std=c17 - Maybe? + Partial C23 @@ -425,7 +425,7 @@

C11 implementation status

Extending the lifetime of temporary objects (factored approach) N1285 - Unknown + No Requiring signed char to have no padding bits @@ -457,11 +457,6 @@

C11 implementation status

N1349 Unknown - - FLT_EVAL_METHOD issues (first change only) - N1353 - Unknown - _Bool bit-fields N1356 @@ -493,11 +488,6 @@

C11 implementation status

N1367 Unknown - - FLT_EVAL_METHOD and return - N1382 - Unknown - Floating-point to int/_Bool conversions N1391 @@ -609,11 +599,6 @@

C11 implementation status

N1532 Yes - - Synthesis re _Atomic - N1537 - Unknown - Clarification for wide evaluation N1531 @@ -652,11 +637,6 @@

C23 implementation status

N2186 Unknown - - Clarifying the restrict Keyword v2 - N2660 - Unknown - Harmonizing static_assert with C++ N2665 @@ -673,7 +653,7 @@

C23 implementation status

Clang 9 - TS 18661 Integration + TS 18661 Integration N2314 @@ -707,6 +687,10 @@

C23 implementation status

N2755 Unknown + + N2931 + Unknown + Preprocessor line numbers unspecified N2322 @@ -762,11 +746,8 @@

C23 implementation status

Clang 15 - - *_IS_IEC_60559 feature test macros - N2379 - Unknown - + Annex F.8 update for implementation extensions and rounding N2384 @@ -999,7 +980,7 @@

C23 implementation status

Clarification for max exponent macros N2843 - Unknown + Subsumed by N2882 Clarification about expression transformations @@ -1040,7 +1021,7 @@

C23 implementation status

Disambiguate the storage class of some compound literals N2819 - Unknown + No Add annotations for unreachable control flow v2 @@ -1109,11 +1090,6 @@

C23 implementation status

N2930 Clang 16 - - Type annex tgmath narrowing macros with integer args v2 - N2931 - Unknown - Revise spelling of keywords v7 N2934 @@ -1194,7 +1170,7 @@

C23 implementation status

Comma omission and deletion (__VA_OPT__) N3033 - Unknown + Clang 12 Underspecified object definitions diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 65dd31a0fb802c..8f21a7c9f8737d 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -204,6 +204,27 @@

C++2c implementation status

P2893R3 No + + + constexpr placement new + P2747R2 + No + + + Deleting a Pointer to an Incomplete Type Should be Ill-formed + P3144R2 + No + + + Ordering of constraints involving fold expressions + P2963R3 + No + + + Structured binding declaration as a condition + P0963R3 + No + @@ -247,7 +268,7 @@

C++23 implementation status

Allow duplicate attributes - P2156R1 + P2156R1 (DR) Clang 13 @@ -267,7 +288,7 @@

C++23 implementation status

C++ identifier syntax using UAX 31 - P1949R7 + P1949R7 (DR) Clang 14 @@ -287,11 +308,11 @@

C++23 implementation status

Change scope of lambda trailing-return-type - P2036R3 + P2036R3 (DR) Clang 17 - P2579R0 + P2579R0 (DR) Multidimensional subscript operator @@ -352,12 +373,12 @@

C++23 implementation status

The Equality Operator You Are Looking For - P2468R2 + P2468R2 (DR) Clang 16 De-deprecating volatile compound operations - P2327R1 + P2327R1 (DR) Clang 15 @@ -422,12 +443,12 @@

C++23 implementation status

char8_t Compatibility and Portability Fix - P2513R3 + P2513R3 (DR) Clang 16 Relax requirements on wchar_t to match existing practices - P2460R2 + P2460R2 (DR) Yes @@ -563,7 +584,7 @@

C++20 implementation status

P2103R0 - P2493R0 + P2493R0 (DR) P2092R0 diff --git a/clang/www/get_involved.html b/clang/www/get_involved.html index 3fc688837d0bc0..99fa03abb6234d 100755 --- a/clang/www/get_involved.html +++ b/clang/www/get_involved.html @@ -126,6 +126,12 @@

Contributing Extensions to Clang

extension is not broken by ongoing maintenance in Clang. The test suite should be complete enough that another compiler vendor could conceivably validate their implementation of the feature against it. + +
  • A support story for other impacted projects within the monorepo: If the + extension can impact other parts of the project (libc++, lldb, compiler-rt, + etc.), the proposal needs to document the impact on these projects, what is + needed for them to fully support the extension, and what level of support is + expected. The impacted project communities need to agree with that plan.
  • diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 75e4d3677703ad..7037bf89d65327 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -837,7 +837,7 @@ else() endif() if (COMPILER_RT_HAS_SANITIZER_COMMON AND SAFESTACK_SUPPORTED_ARCH AND - OS_NAME MATCHES "Linux|FreeBSD|NetBSD") + OS_NAME MATCHES "Linux|FreeBSD|NetBSD|SunOS") set(COMPILER_RT_HAS_SAFESTACK TRUE) else() set(COMPILER_RT_HAS_SAFESTACK FALSE) diff --git a/compiler-rt/include/profile/MIBEntryDef.inc b/compiler-rt/include/profile/MIBEntryDef.inc index 794163ae10386c..58c1fc4de4aba5 100644 --- a/compiler-rt/include/profile/MIBEntryDef.inc +++ b/compiler-rt/include/profile/MIBEntryDef.inc @@ -51,3 +51,5 @@ MIBEntryDef(MaxAccessDensity = 22, MaxAccessDensity, uint32_t) MIBEntryDef(TotalLifetimeAccessDensity = 23, TotalLifetimeAccessDensity, uint64_t) MIBEntryDef(MinLifetimeAccessDensity = 24, MinLifetimeAccessDensity, uint32_t) MIBEntryDef(MaxLifetimeAccessDensity = 25, MaxLifetimeAccessDensity, uint32_t) +MIBEntryDef(AccessHistogramSize = 26, AccessHistogramSize, uint32_t) +MIBEntryDef(AccessHistogram = 27, AccessHistogram, uintptr_t) \ No newline at end of file diff --git a/compiler-rt/include/profile/MemProfData.inc b/compiler-rt/include/profile/MemProfData.inc index b82a4baf6dd74e..3dc88478fb93a0 100644 --- a/compiler-rt/include/profile/MemProfData.inc +++ b/compiler-rt/include/profile/MemProfData.inc @@ -33,7 +33,11 @@ (uint64_t)'o' << 24 | (uint64_t)'f' << 16 | (uint64_t)'r' << 8 | (uint64_t)129) // The version number of the raw binary format. -#define MEMPROF_RAW_VERSION 3ULL +#define MEMPROF_RAW_VERSION 4ULL + +// Currently supported versions. +#define MEMPROF_RAW_SUPPORTED_VERSIONS \ + { 3ULL, 4ULL } #define MEMPROF_BUILDID_MAX_SIZE 32ULL @@ -119,7 +123,8 @@ MemInfoBlock() { } MemInfoBlock(uint32_t Size, uint64_t AccessCount, uint32_t AllocTs, - uint32_t DeallocTs, uint32_t AllocCpu, uint32_t DeallocCpu) + uint32_t DeallocTs, uint32_t AllocCpu, uint32_t DeallocCpu, + uintptr_t Histogram, uint32_t HistogramSize) : MemInfoBlock() { AllocCount = 1U; TotalAccessCount = AccessCount; @@ -149,6 +154,8 @@ MemInfoBlock(uint32_t Size, uint64_t AccessCount, uint32_t AllocTs, AllocCpuId = AllocCpu; DeallocCpuId = DeallocCpu; NumMigratedCpu = AllocCpuId != DeallocCpuId; + AccessHistogramSize = HistogramSize; + AccessHistogram = Histogram; } void Merge(const MemInfoBlock &newMIB) { @@ -194,6 +201,24 @@ void Merge(const MemInfoBlock &newMIB) { NumSameDeallocCpu += DeallocCpuId == newMIB.DeallocCpuId; AllocCpuId = newMIB.AllocCpuId; DeallocCpuId = newMIB.DeallocCpuId; + + // For merging histograms, we always keep the longer histogram, and add + // values of shorter histogram to larger one. 
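+  // Illustrative example (assumed values, not taken from the patch): merging
+  // a 2-bucket histogram {3, 1} into a 4-bucket histogram {5, 0, 2, 7} keeps
+  // the 4-bucket array and yields {8, 1, 2, 7}.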
+ uintptr_t ShorterHistogram; + uint32_t ShorterHistogramSize; + if (newMIB.AccessHistogramSize > AccessHistogramSize) { + ShorterHistogram = AccessHistogram; + ShorterHistogramSize = AccessHistogramSize; + // Swap histogram of current to larger histogram + AccessHistogram = newMIB.AccessHistogram; + AccessHistogramSize = newMIB.AccessHistogramSize; + } else { + ShorterHistogram = newMIB.AccessHistogram; + ShorterHistogramSize = newMIB.AccessHistogramSize; + } + for (size_t i = 0; i < ShorterHistogramSize; ++i) { + ((uint64_t *)AccessHistogram)[i] += ((uint64_t *)ShorterHistogram)[i]; + } } #ifdef _MSC_VER diff --git a/compiler-rt/lib/asan/asan_globals.cpp b/compiler-rt/lib/asan/asan_globals.cpp index 6ac64c4b776bbb..d413b1ebc9fc0e 100644 --- a/compiler-rt/lib/asan/asan_globals.cpp +++ b/compiler-rt/lib/asan/asan_globals.cpp @@ -344,8 +344,8 @@ void __asan_unregister_image_globals(uptr *flag) { } void __asan_register_elf_globals(uptr *flag, void *start, void *stop) { - if (*flag) return; - if (!start) return; + if (*flag || start == stop) + return; CHECK_EQ(0, ((uptr)stop - (uptr)start) % sizeof(__asan_global)); __asan_global *globals_start = (__asan_global*)start; __asan_global *globals_stop = (__asan_global*)stop; @@ -354,8 +354,8 @@ void __asan_register_elf_globals(uptr *flag, void *start, void *stop) { } void __asan_unregister_elf_globals(uptr *flag, void *start, void *stop) { - if (!*flag) return; - if (!start) return; + if (!*flag || start == stop) + return; CHECK_EQ(0, ((uptr)stop - (uptr)start) % sizeof(__asan_global)); __asan_global *globals_start = (__asan_global*)start; __asan_global *globals_stop = (__asan_global*)stop; diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index 6d1360e104975f..f8f86a766b204f 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -333,7 +333,7 @@ INTERCEPTOR(int, pthread_timedjoin_np, void *thread, void **ret, } # endif -DEFINE_REAL_PTHREAD_FUNCTIONS +DEFINE_INTERNAL_PTHREAD_FUNCTIONS #endif // ASAN_INTERCEPT_PTHREAD_CREATE #if ASAN_INTERCEPT_SWAPCONTEXT diff --git a/compiler-rt/lib/gwp_asan/guarded_pool_allocator.cpp b/compiler-rt/lib/gwp_asan/guarded_pool_allocator.cpp index 5f41e1eb8a4b47..a5f1ecd4b26add 100644 --- a/compiler-rt/lib/gwp_asan/guarded_pool_allocator.cpp +++ b/compiler-rt/lib/gwp_asan/guarded_pool_allocator.cpp @@ -52,12 +52,12 @@ void GuardedPoolAllocator::init(const options::Options &Opts) { Opts.MaxSimultaneousAllocations == 0) return; - Check(Opts.SampleRate >= 0, "GWP-ASan Error: SampleRate is < 0."); - Check(Opts.SampleRate < (1 << 30), "GWP-ASan Error: SampleRate is >= 2^30."); - Check(Opts.MaxSimultaneousAllocations >= 0, + check(Opts.SampleRate >= 0, "GWP-ASan Error: SampleRate is < 0."); + check(Opts.SampleRate < (1 << 30), "GWP-ASan Error: SampleRate is >= 2^30."); + check(Opts.MaxSimultaneousAllocations >= 0, "GWP-ASan Error: MaxSimultaneousAllocations is < 0."); - Check(SingletonPtr == nullptr, + check(SingletonPtr == nullptr, "There's already a live GuardedPoolAllocator!"); SingletonPtr = this; Backtrace = Opts.Backtrace; diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp index ca5231a235f5c0..3f39402652a998 100644 --- a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp +++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp @@ -24,13 +24,13 
@@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const { assert((Size % State.PageSize) == 0); zx_handle_t Vmo; zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo); - Check(Status == ZX_OK, "Failed to create Vmo"); + check(Status == ZX_OK, "Failed to create Vmo"); _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name)); zx_vaddr_t Addr; Status = _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_ALLOW_FAULTS, 0, Vmo, 0, Size, &Addr); - Check(Status == ZX_OK, "Vmo mapping failed"); + check(Status == ZX_OK, "Vmo mapping failed"); _zx_handle_close(Vmo); return reinterpret_cast(Addr); } @@ -40,7 +40,7 @@ void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const { assert((Size % State.PageSize) == 0); zx_status_t Status = _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast(Ptr), Size); - Check(Status == ZX_OK, "Vmo unmapping failed"); + check(Status == ZX_OK, "Vmo unmapping failed"); } void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { @@ -50,7 +50,7 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { _zx_vmar_root_self(), ZX_VM_CAN_MAP_READ | ZX_VM_CAN_MAP_WRITE | ZX_VM_CAN_MAP_SPECIFIC, 0, Size, &GuardedPagePoolPlatformData.Vmar, &Addr); - Check(Status == ZX_OK, "Failed to reserve guarded pool allocator memory"); + check(Status == ZX_OK, "Failed to reserve guarded pool allocator memory"); _zx_object_set_property(GuardedPagePoolPlatformData.Vmar, ZX_PROP_NAME, kGwpAsanGuardPageName, strlen(kGwpAsanGuardPageName)); return reinterpret_cast(Addr); @@ -59,8 +59,8 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { void GuardedPoolAllocator::unreserveGuardedPool() { const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar; assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self()); - Check(_zx_vmar_destroy(Vmar) == ZX_OK, "Failed to destroy a vmar"); - Check(_zx_handle_close(Vmar) == ZX_OK, "Failed to close a vmar"); + check(_zx_vmar_destroy(Vmar) == ZX_OK, "Failed to destroy a vmar"); + check(_zx_handle_close(Vmar) == ZX_OK, "Failed to close a vmar"); GuardedPagePoolPlatformData.Vmar = ZX_HANDLE_INVALID; } @@ -69,7 +69,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const { assert((Size % State.PageSize) == 0); zx_handle_t Vmo; zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo); - Check(Status == ZX_OK, "Failed to create vmo"); + check(Status == ZX_OK, "Failed to create vmo"); _zx_object_set_property(Vmo, ZX_PROP_NAME, kGwpAsanAliveSlotName, strlen(kGwpAsanAliveSlotName)); const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar; @@ -81,7 +81,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const { ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_ALLOW_FAULTS | ZX_VM_SPECIFIC, Offset, Vmo, 0, Size, &P); - Check(Status == ZX_OK, "Vmo mapping failed"); + check(Status == ZX_OK, "Vmo mapping failed"); _zx_handle_close(Vmo); } @@ -93,7 +93,7 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr, assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self()); const zx_status_t Status = _zx_vmar_unmap(Vmar, reinterpret_cast(Ptr), Size); - Check(Status == ZX_OK, "Vmar unmapping failed"); + check(Status == ZX_OK, "Vmar unmapping failed"); } size_t GuardedPoolAllocator::getPlatformPageSize() { diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp index c036ebe3efcc02..549e31acb176d3 100644 --- 
a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp +++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp @@ -46,7 +46,7 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const { assert((Size % State.PageSize) == 0); void *Ptr = mmap(nullptr, Size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - Check(Ptr != MAP_FAILED, "Failed to map guarded pool allocator memory"); + check(Ptr != MAP_FAILED, "Failed to map guarded pool allocator memory"); MaybeSetMappingName(Ptr, Size, Name); return Ptr; } @@ -54,7 +54,7 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const { void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const { assert((reinterpret_cast(Ptr) % State.PageSize) == 0); assert((Size % State.PageSize) == 0); - Check(munmap(Ptr, Size) == 0, + check(munmap(Ptr, Size) == 0, "Failed to unmap guarded pool allocator memory."); } @@ -62,7 +62,7 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { assert((Size % State.PageSize) == 0); void *Ptr = mmap(nullptr, Size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - Check(Ptr != MAP_FAILED, "Failed to reserve guarded pool allocator memory"); + check(Ptr != MAP_FAILED, "Failed to reserve guarded pool allocator memory"); MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName); return Ptr; } @@ -75,7 +75,7 @@ void GuardedPoolAllocator::unreserveGuardedPool() { void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const { assert((reinterpret_cast(Ptr) % State.PageSize) == 0); assert((Size % State.PageSize) == 0); - Check(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0, + check(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0, "Failed to allocate in guarded pool allocator memory"); MaybeSetMappingName(Ptr, Size, kGwpAsanAliveSlotName); } @@ -87,7 +87,7 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr, // mmap() a PROT_NONE page over the address to release it to the system, if // we used mprotect() here the system would count pages in the quarantine // against the RSS. - Check(mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, + check(mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0) != MAP_FAILED, "Failed to deallocate in guarded pool allocator memory"); MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName); diff --git a/compiler-rt/lib/gwp_asan/utilities.h b/compiler-rt/lib/gwp_asan/utilities.h index d8bc0e491a3d2e..76e5df2e3eb8f8 100644 --- a/compiler-rt/lib/gwp_asan/utilities.h +++ b/compiler-rt/lib/gwp_asan/utilities.h @@ -18,7 +18,7 @@ namespace gwp_asan { void die(const char *Message); // Checks that `Condition` is true, otherwise dies with `Message`. 
-GWP_ASAN_ALWAYS_INLINE void Check(bool Condition, const char *Message) { +GWP_ASAN_ALWAYS_INLINE void check(bool Condition, const char *Message) { if (Condition) return; die(Message); diff --git a/compiler-rt/lib/hwasan/hwasan_interceptors.cpp b/compiler-rt/lib/hwasan/hwasan_interceptors.cpp index 08ae435b8214ae..c10b5c158548e9 100644 --- a/compiler-rt/lib/hwasan/hwasan_interceptors.cpp +++ b/compiler-rt/lib/hwasan/hwasan_interceptors.cpp @@ -334,7 +334,7 @@ INTERCEPTOR(int, pthread_timedjoin_np, void *thread, void **ret, } # endif -DEFINE_REAL_PTHREAD_FUNCTIONS +DEFINE_INTERNAL_PTHREAD_FUNCTIONS DEFINE_REAL(int, vfork,) DECLARE_EXTERN_INTERCEPTOR_AND_WRAPPER(int, vfork,) diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp index 1fd0010f9ea936..6df4b6865b3794 100644 --- a/compiler-rt/lib/lsan/lsan_interceptors.cpp +++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp @@ -525,7 +525,7 @@ INTERCEPTOR(int, pthread_timedjoin_np, void *thread, void **ret, # define LSAN_MAYBE_INTERCEPT_TIMEDJOIN # endif // SANITIZER_INTERCEPT_TIMEDJOIN -DEFINE_REAL_PTHREAD_FUNCTIONS +DEFINE_INTERNAL_PTHREAD_FUNCTIONS INTERCEPTOR(void, _exit, int status) { if (status == 0 && HasReportedLeaks()) status = common_flags()->exitcode; diff --git a/compiler-rt/lib/memprof/memprof_allocator.cpp b/compiler-rt/lib/memprof/memprof_allocator.cpp index 35e941228525a8..19b2b901068246 100644 --- a/compiler-rt/lib/memprof/memprof_allocator.cpp +++ b/compiler-rt/lib/memprof/memprof_allocator.cpp @@ -34,6 +34,10 @@ #include #include +#define MAX_HISTOGRAM_PRINT_SIZE 32U + +extern bool __memprof_histogram; + namespace __memprof { namespace { using ::llvm::memprof::MemInfoBlock; @@ -68,6 +72,14 @@ void Print(const MemInfoBlock &M, const u64 id, bool print_terse) { "cpu: %u, num same dealloc_cpu: %u\n", M.NumMigratedCpu, M.NumLifetimeOverlaps, M.NumSameAllocCpu, M.NumSameDeallocCpu); + Printf("AccessCountHistogram[%u]: ", M.AccessHistogramSize); + uint32_t PrintSize = M.AccessHistogramSize > MAX_HISTOGRAM_PRINT_SIZE + ? MAX_HISTOGRAM_PRINT_SIZE + : M.AccessHistogramSize; + for (size_t i = 0; i < PrintSize; ++i) { + Printf("%llu ", ((uint64_t *)M.AccessHistogram)[i]); + } + Printf("\n"); } } } // namespace @@ -216,6 +228,17 @@ u64 GetShadowCount(uptr p, u32 size) { return count; } +// Accumulates the access count from the shadow for the given pointer and size. +// See memprof_mapping.h for an overview on histogram counters. +u64 GetShadowCountHistogram(uptr p, u32 size) { + u8 *shadow = (u8 *)HISTOGRAM_MEM_TO_SHADOW(p); + u8 *shadow_end = (u8 *)HISTOGRAM_MEM_TO_SHADOW(p + size); + u64 count = 0; + for (; shadow <= shadow_end; shadow++) + count += *shadow; + return count; +} + // Clears the shadow counters (when memory is allocated). 
void ClearShadow(uptr addr, uptr size) { CHECK(AddrIsAlignedByGranularity(addr)); @@ -223,8 +246,16 @@ void ClearShadow(uptr addr, uptr size) { CHECK(AddrIsAlignedByGranularity(addr + size)); CHECK(AddrIsInMem(addr + size - SHADOW_GRANULARITY)); CHECK(REAL(memset)); - uptr shadow_beg = MEM_TO_SHADOW(addr); - uptr shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1; + uptr shadow_beg; + uptr shadow_end; + if (__memprof_histogram) { + shadow_beg = HISTOGRAM_MEM_TO_SHADOW(addr); + shadow_end = HISTOGRAM_MEM_TO_SHADOW(addr + size); + } else { + shadow_beg = MEM_TO_SHADOW(addr); + shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1; + } + if (shadow_end - shadow_beg < common_flags()->clear_shadow_mmap_threshold) { REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg); } else { @@ -279,6 +310,44 @@ struct Allocator { Print(Value->mib, Key, bool(Arg)); } + // See memprof_mapping.h for an overview on histogram counters. + static MemInfoBlock CreateNewMIB(uptr p, MemprofChunk *m, u64 user_size) { + if (__memprof_histogram) { + return CreateNewMIBWithHistogram(p, m, user_size); + } else { + return CreateNewMIBWithoutHistogram(p, m, user_size); + } + } + + static MemInfoBlock CreateNewMIBWithHistogram(uptr p, MemprofChunk *m, + u64 user_size) { + + u64 c = GetShadowCountHistogram(p, user_size); + long curtime = GetTimestamp(); + uint32_t HistogramSize = + RoundUpTo(user_size, HISTOGRAM_GRANULARITY) / HISTOGRAM_GRANULARITY; + uintptr_t Histogram = + (uintptr_t)InternalAlloc(HistogramSize * sizeof(uint64_t)); + memset((void *)Histogram, 0, HistogramSize * sizeof(uint64_t)); + for (size_t i = 0; i < HistogramSize; ++i) { + u8 Counter = + *((u8 *)HISTOGRAM_MEM_TO_SHADOW(p + HISTOGRAM_GRANULARITY * i)); + ((uint64_t *)Histogram)[i] = (uint64_t)Counter; + } + MemInfoBlock newMIB(user_size, c, m->timestamp_ms, curtime, m->cpu_id, + GetCpuId(), Histogram, HistogramSize); + return newMIB; + } + + static MemInfoBlock CreateNewMIBWithoutHistogram(uptr p, MemprofChunk *m, + u64 user_size) { + u64 c = GetShadowCount(p, user_size); + long curtime = GetTimestamp(); + MemInfoBlock newMIB(user_size, c, m->timestamp_ms, curtime, m->cpu_id, + GetCpuId(), 0, 0); + return newMIB; + } + void FinishAndWrite() { if (print_text && common_flags()->print_module_map) DumpProcessMap(); @@ -319,10 +388,7 @@ struct Allocator { if (!m) return; uptr user_beg = ((uptr)m) + kChunkHeaderSize; - u64 c = GetShadowCount(user_beg, user_requested_size); - long curtime = GetTimestamp(); - MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime, - m->cpu_id, GetCpuId()); + MemInfoBlock newMIB = CreateNewMIB(user_beg, m, user_requested_size); InsertOrMerge(m->alloc_context_id, newMIB, A->MIBMap); }, this); @@ -451,11 +517,7 @@ struct Allocator { atomic_exchange(&m->user_requested_size, 0, memory_order_acquire); if (memprof_inited && atomic_load_relaxed(&constructed) && !atomic_load_relaxed(&destructing)) { - u64 c = GetShadowCount(p, user_requested_size); - long curtime = GetTimestamp(); - - MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime, - m->cpu_id, GetCpuId()); + MemInfoBlock newMIB = this->CreateNewMIB(p, m, user_requested_size); InsertOrMerge(m->alloc_context_id, newMIB, MIBMap); } diff --git a/compiler-rt/lib/memprof/memprof_flags.inc b/compiler-rt/lib/memprof/memprof_flags.inc index ee0760ddc302a6..7c5dc091f79351 100644 --- a/compiler-rt/lib/memprof/memprof_flags.inc +++ b/compiler-rt/lib/memprof/memprof_flags.inc @@ -38,4 +38,4 @@ MEMPROF_FLAG(bool, 
allocator_frees_and_returns_null_on_realloc_zero, true, MEMPROF_FLAG(bool, print_text, false, "If set, prints the heap profile in text format. Else use the raw binary serialization format.") MEMPROF_FLAG(bool, print_terse, false, - "If set, prints memory profile in a terse format. Only applicable if print_text = true.") + "If set, prints memory profile in a terse format. Only applicable if print_text = true.") \ No newline at end of file diff --git a/compiler-rt/lib/memprof/memprof_interceptors.cpp b/compiler-rt/lib/memprof/memprof_interceptors.cpp index a267f6d3d6717b..53ee4e953419b5 100644 --- a/compiler-rt/lib/memprof/memprof_interceptors.cpp +++ b/compiler-rt/lib/memprof/memprof_interceptors.cpp @@ -166,7 +166,7 @@ INTERCEPTOR(int, pthread_join, void *t, void **arg) { return REAL(pthread_join)(t, arg); } -DEFINE_REAL_PTHREAD_FUNCTIONS +DEFINE_INTERNAL_PTHREAD_FUNCTIONS INTERCEPTOR(char *, index, const char *string, int c) ALIAS(WRAP(strchr)); diff --git a/compiler-rt/lib/memprof/memprof_mapping.h b/compiler-rt/lib/memprof/memprof_mapping.h index 1cc0836834cdf0..fef8acfcfc9219 100644 --- a/compiler-rt/lib/memprof/memprof_mapping.h +++ b/compiler-rt/lib/memprof/memprof_mapping.h @@ -22,7 +22,6 @@ static const u64 kDefaultShadowScale = 3; #define SHADOW_GRANULARITY (1ULL << SHADOW_SCALE) #define MEMPROF_ALIGNMENT 32 - namespace __memprof { extern uptr kHighMemEnd; // Initialized in __memprof_init. @@ -37,6 +36,34 @@ extern uptr kHighMemEnd; // Initialized in __memprof_init. #define MEM_TO_SHADOW(mem) \ ((((mem) & SHADOW_MASK) >> SHADOW_SCALE) + (SHADOW_OFFSET)) +// Histogram shadow memory is laid different to the standard configuration: + +// 8 bytes +// +---+---+---+ +---+---+---+ +---+---+---+ +// Memory | a | | b | | c | +// +---+---+---+ +---+---+---+ +---+---+---+ + +// +---+ +---+ +---+ +// Shadow | a | | b | | c | +// +---+ +---+ +---+ +// 1 byte +// +// Where we have a 1 byte counter for each 8 bytes. HISTOGRAM_MEM_TO_SHADOW +// translates a memory address to the address of its corresponding shadow +// counter memory address. The same data is still provided in MIB whether +// histograms are used or not. Total access counts per allocations are +// computed by summing up all individual 1 byte counters. This can incur an +// accuracy penalty. 
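+//
+// Worked example (illustration only, using the definitions below): with
+// HISTOGRAM_GRANULARITY == 8, a 24-byte allocation is covered by
+// RoundUpTo(24, 8) / 8 == 3 one-byte counters, and its total access count is
+// counter[0] + counter[1] + counter[2].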
+ +#define HISTOGRAM_GRANULARITY 8U + +#define HISTOGRAM_MAX_COUNTER 255U + +#define HISTOGRAM_SHADOW_MASK ~(HISTOGRAM_GRANULARITY - 1) + +#define HISTOGRAM_MEM_TO_SHADOW(mem) \ + ((((mem) & HISTOGRAM_SHADOW_MASK) >> SHADOW_SCALE) + (SHADOW_OFFSET)) + #define SHADOW_ENTRY_SIZE (MEM_GRANULARITY >> SHADOW_SCALE) #define kLowMemBeg 0 @@ -108,6 +135,14 @@ inline void RecordAccess(uptr a) { (*shadow_address)++; } +inline void RecordAccessHistogram(uptr a) { + CHECK_EQ(SHADOW_ENTRY_SIZE, 8); + u8 *shadow_address = (u8 *)HISTOGRAM_MEM_TO_SHADOW(a); + if (*shadow_address < HISTOGRAM_MAX_COUNTER) { + (*shadow_address)++; + } +} + } // namespace __memprof #endif // MEMPROF_MAPPING_H diff --git a/compiler-rt/lib/memprof/memprof_mibmap.cpp b/compiler-rt/lib/memprof/memprof_mibmap.cpp index 32f0796c8f241a..a49ed8bf4fd64f 100644 --- a/compiler-rt/lib/memprof/memprof_mibmap.cpp +++ b/compiler-rt/lib/memprof/memprof_mibmap.cpp @@ -30,7 +30,18 @@ void InsertOrMerge(const uptr Id, const MemInfoBlock &Block, MIBMapTy &Map) { } else { LockedMemInfoBlock *lmib = *h; SpinMutexLock lock(&lmib->mutex); + uintptr_t ShorterHistogram; + if (Block.AccessHistogramSize > lmib->mib.AccessHistogramSize) + ShorterHistogram = lmib->mib.AccessHistogram; + else + ShorterHistogram = Block.AccessHistogram; + lmib->mib.Merge(Block); + // The larger histogram is kept and the shorter histogram is discarded after + // adding the counters to the larger historam. Free only the shorter + // Histogram + if (Block.AccessHistogramSize > 0 || lmib->mib.AccessHistogramSize > 0) + InternalFree((void *)ShorterHistogram); } } diff --git a/compiler-rt/lib/memprof/memprof_rawprofile.cpp b/compiler-rt/lib/memprof/memprof_rawprofile.cpp index fa92fa0e4b53e9..a897648584828b 100644 --- a/compiler-rt/lib/memprof/memprof_rawprofile.cpp +++ b/compiler-rt/lib/memprof/memprof_rawprofile.cpp @@ -146,24 +146,38 @@ void SerializeStackToBuffer(const Vector &StackIds, // ---------- MIB Entry 0 // Alloc Count // ... +// ---- AccessHistogram Entry 0 +// ... +// ---- AccessHistogram Entry AccessHistogramSize - 1 // ---------- MIB Entry 1 // Alloc Count // ... +// ---- AccessHistogram Entry 0 +// ... +// ---- AccessHistogram Entry AccessHistogramSize - 1 // ---------- void SerializeMIBInfoToBuffer(MIBMapTy &MIBMap, const Vector &StackIds, const u64 ExpectedNumBytes, char *&Buffer) { char *Ptr = Buffer; const u64 NumEntries = StackIds.Size(); Ptr = WriteBytes(NumEntries, Ptr); - for (u64 i = 0; i < NumEntries; i++) { const u64 Key = StackIds[i]; MIBMapTy::Handle h(&MIBMap, Key, /*remove=*/true, /*create=*/false); CHECK(h.exists()); Ptr = WriteBytes(Key, Ptr); + // FIXME: We unnecessarily serialize the AccessHistogram pointer. Adding a + // serialization schema will fix this issue. See also FIXME in + // deserialization. Ptr = WriteBytes((*h)->mib, Ptr); + for (u64 j = 0; j < (*h)->mib.AccessHistogramSize; ++j) { + u64 HistogramEntry = ((u64 *)((*h)->mib.AccessHistogram))[j]; + Ptr = WriteBytes(HistogramEntry, Ptr); + } + if ((*h)->mib.AccessHistogramSize > 0) { + InternalFree((void *)((*h)->mib.AccessHistogram)); + } } - CHECK(ExpectedNumBytes >= static_cast(Ptr - Buffer) && "Expected num bytes != actual bytes written"); } @@ -192,7 +206,15 @@ void SerializeMIBInfoToBuffer(MIBMapTy &MIBMap, const Vector &StackIds, // ---------- MIB Entry // Alloc Count // ... -// ---------- +// ---- AccessHistogram Entry 0 +// ... +// ---- AccessHistogram Entry AccessHistogramSize - 1 +// ---------- MIB Entry 1 +// Alloc Count +// ... +// ---- AccessHistogram Entry 0 +// ... 
+// ---- AccessHistogram Entry AccessHistogramSize - 1 // Optional Padding Bytes // ---------- Stack Info // Num Entries @@ -218,13 +240,26 @@ u64 SerializeToRawProfile(MIBMapTy &MIBMap, ArrayRef Modules, const u64 NumMIBInfoBytes = RoundUpTo( sizeof(u64) + StackIds.Size() * (sizeof(u64) + sizeof(MemInfoBlock)), 8); + // Get Number of AccessHistogram entries in total + u64 TotalAccessHistogramEntries = 0; + MIBMap.ForEach( + [](const uptr Key, UNUSED LockedMemInfoBlock *const &MIB, void *Arg) { + u64 *TotalAccessHistogramEntries = (u64 *)Arg; + *TotalAccessHistogramEntries += MIB->mib.AccessHistogramSize; + }, + reinterpret_cast(&TotalAccessHistogramEntries)); + const u64 NumHistogramBytes = + RoundUpTo(TotalAccessHistogramEntries * sizeof(uint64_t), 8); + const u64 NumStackBytes = RoundUpTo(StackSizeBytes(StackIds), 8); // Ensure that the profile is 8b aligned. We allow for some optional padding // at the end so that any subsequent profile serialized to the same file does // not incur unaligned accesses. - const u64 TotalSizeBytes = RoundUpTo( - sizeof(Header) + NumSegmentBytes + NumStackBytes + NumMIBInfoBytes, 8); + const u64 TotalSizeBytes = + RoundUpTo(sizeof(Header) + NumSegmentBytes + NumStackBytes + + NumMIBInfoBytes + NumHistogramBytes, + 8); // Allocate the memory for the entire buffer incl. info blocks. Buffer = (char *)InternalAlloc(TotalSizeBytes); @@ -235,14 +270,16 @@ u64 SerializeToRawProfile(MIBMapTy &MIBMap, ArrayRef Modules, static_cast(TotalSizeBytes), sizeof(Header), sizeof(Header) + NumSegmentBytes, - sizeof(Header) + NumSegmentBytes + NumMIBInfoBytes}; + sizeof(Header) + NumSegmentBytes + NumMIBInfoBytes + + NumHistogramBytes}; Ptr = WriteBytes(header, Ptr); SerializeSegmentsToBuffer(Modules, NumSegmentBytes, Ptr); Ptr += NumSegmentBytes; - SerializeMIBInfoToBuffer(MIBMap, StackIds, NumMIBInfoBytes, Ptr); - Ptr += NumMIBInfoBytes; + SerializeMIBInfoToBuffer(MIBMap, StackIds, + NumMIBInfoBytes + NumHistogramBytes, Ptr); + Ptr += NumMIBInfoBytes + NumHistogramBytes; SerializeStackToBuffer(StackIds, NumStackBytes, Ptr); diff --git a/compiler-rt/lib/memprof/memprof_rtl.cpp b/compiler-rt/lib/memprof/memprof_rtl.cpp index 0a63f813848eef..a87e8cefd6c40f 100644 --- a/compiler-rt/lib/memprof/memprof_rtl.cpp +++ b/compiler-rt/lib/memprof/memprof_rtl.cpp @@ -32,6 +32,9 @@ uptr __memprof_shadow_memory_dynamic_address; // Global interface symbol. // Allow the user to specify a profile output file via the binary. SANITIZER_WEAK_ATTRIBUTE char __memprof_profile_filename[1]; +// Share ClHistogram compiler flag with runtime. 
+SANITIZER_WEAK_ATTRIBUTE bool __memprof_histogram; + namespace __memprof { static void MemprofDie() { @@ -75,12 +78,23 @@ uptr kHighMemEnd; // exported functions #define MEMPROF_MEMORY_ACCESS_CALLBACK_BODY() __memprof::RecordAccess(addr); +#define MEMPROF_MEMORY_ACCESS_CALLBACK_BODY_HIST() \ + __memprof::RecordAccessHistogram(addr); #define MEMPROF_MEMORY_ACCESS_CALLBACK(type) \ extern "C" NOINLINE INTERFACE_ATTRIBUTE void __memprof_##type(uptr addr) { \ MEMPROF_MEMORY_ACCESS_CALLBACK_BODY() \ } +#define MEMPROF_MEMORY_ACCESS_CALLBACK_HIST(type) \ + extern "C" NOINLINE INTERFACE_ATTRIBUTE void __memprof_hist_##type( \ + uptr addr) { \ + MEMPROF_MEMORY_ACCESS_CALLBACK_BODY_HIST() \ + } + +MEMPROF_MEMORY_ACCESS_CALLBACK_HIST(load) +MEMPROF_MEMORY_ACCESS_CALLBACK_HIST(store) + MEMPROF_MEMORY_ACCESS_CALLBACK(load) MEMPROF_MEMORY_ACCESS_CALLBACK(store) @@ -260,11 +274,20 @@ void __memprof_record_access(void const volatile *addr) { __memprof::RecordAccess((uptr)addr); } +void __memprof_record_access_hist(void const volatile *addr) { + __memprof::RecordAccessHistogram((uptr)addr); +} + void __memprof_record_access_range(void const volatile *addr, uptr size) { for (uptr a = (uptr)addr; a < (uptr)addr + size; a += kWordSize) __memprof::RecordAccess(a); } +void __memprof_record_access_range_hist(void const volatile *addr, uptr size) { + for (uptr a = (uptr)addr; a < (uptr)addr + size; a += kWordSize) + __memprof::RecordAccessHistogram(a); +} + extern "C" SANITIZER_INTERFACE_ATTRIBUTE u16 __sanitizer_unaligned_load16(const uu16 *p) { __memprof_record_access(p); diff --git a/compiler-rt/lib/memprof/tests/rawprofile.cpp b/compiler-rt/lib/memprof/tests/rawprofile.cpp index c5dfdca890be91..5764af9ce8afb5 100644 --- a/compiler-rt/lib/memprof/tests/rawprofile.cpp +++ b/compiler-rt/lib/memprof/tests/rawprofile.cpp @@ -95,7 +95,7 @@ TEST(MemProf, Basic) { // sizeof(MemInfoBlock) contains stack id + MeminfoBlock. EXPECT_EQ(StackOffset - MIBOffset, 8 + 2 * (8 + sizeof(MemInfoBlock))); - EXPECT_EQ(StackOffset, 408ULL); + EXPECT_EQ(StackOffset, 432ULL); // We expect 2 stack entries, with 5 frames - 8b for total count, // 2 * (8b for id, 8b for frame count and 5*8b for fake frames). 
// Since this is the last section, there may be additional padding at the end diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index 9abf2406332588..789b739b41189a 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -1226,7 +1226,7 @@ INTERCEPTOR(int, pthread_timedjoin_np, void *thread, void **retval, } #endif -DEFINE_REAL_PTHREAD_FUNCTIONS +DEFINE_INTERNAL_PTHREAD_FUNCTIONS extern char *tzname[2]; diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 54ee59c30722f0..1c58584d2d4f73 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -210,10 +210,9 @@ COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR; #define WIN_SYM_PREFIX #endif #pragma comment( \ - linker, \ - "/alternatename:" WIN_SYM_PREFIX INSTR_PROF_QUOTE( \ - INSTR_PROF_PROFILE_COUNTER_BIAS_VAR) "=" WIN_SYM_PREFIX \ - INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR)) + linker, "/alternatename:" WIN_SYM_PREFIX INSTR_PROF_QUOTE( \ + INSTR_PROF_PROFILE_COUNTER_BIAS_VAR) "=" WIN_SYM_PREFIX \ + INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR)) #else COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __attribute__((weak, alias(INSTR_PROF_QUOTE( diff --git a/compiler-rt/lib/safestack/safestack_platform.h b/compiler-rt/lib/safestack/safestack_platform.h index 2b1fc139baa90a..d4b2e2ef7391c8 100644 --- a/compiler-rt/lib/safestack/safestack_platform.h +++ b/compiler-rt/lib/safestack/safestack_platform.h @@ -25,8 +25,9 @@ #include #include -#if !(SANITIZER_NETBSD || SANITIZER_FREEBSD || SANITIZER_LINUX) -#error "Support for your platform has not been implemented" +#if !(SANITIZER_NETBSD || SANITIZER_FREEBSD || SANITIZER_LINUX || \ + SANITIZER_SOLARIS) +# error "Support for your platform has not been implemented" #endif #if SANITIZER_NETBSD @@ -39,6 +40,10 @@ extern "C" void *__mmap(void *, size_t, int, int, int, int, off_t); #include #endif +#if SANITIZER_SOLARIS +# include +#endif + namespace safestack { #if SANITIZER_NETBSD @@ -73,6 +78,8 @@ inline ThreadId GetTid() { long Tid; thr_self(&Tid); return Tid; +#elif SANITIZER_SOLARIS + return thr_self(); #else return syscall(SYS_gettid); #endif @@ -83,6 +90,12 @@ inline int TgKill(pid_t pid, ThreadId tid, int sig) { DEFINE__REAL(int, _lwp_kill, int a, int b); (void)pid; return _REAL(_lwp_kill, tid, sig); +#elif SANITIZER_SOLARIS +# ifdef SYS_lwp_kill + return syscall(SYS_lwp_kill, tid, sig); +# else + return -1; +# endif #elif SANITIZER_FREEBSD return syscall(SYS_thr_kill2, pid, tid, sig); #else @@ -96,6 +109,9 @@ inline void *Mmap(void *addr, size_t length, int prot, int flags, int fd, return __mmap(addr, length, prot, flags, fd, 0, offset); #elif SANITIZER_FREEBSD && (defined(__aarch64__) || defined(__x86_64__)) return (void *)__syscall(SYS_mmap, addr, length, prot, flags, fd, offset); +#elif SANITIZER_SOLARIS + return (void *)(uintptr_t)syscall(SYS_mmap, addr, length, prot, flags, fd, + offset); #else return (void *)syscall(SYS_mmap, addr, length, prot, flags, fd, offset); #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp index 7b74bb1a7e0f3c..a174ae7be991d0 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp @@ 
-87,8 +87,8 @@ void MaybeStartBackgroudThread() { if (!common_flags()->hard_rss_limit_mb && !common_flags()->soft_rss_limit_mb && !common_flags()->heap_profile) return; - if (!&real_pthread_create) { - VPrintf(1, "%s: real_pthread_create undefined\n", SanitizerToolName); + if (!&internal_pthread_create) { + VPrintf(1, "%s: internal_pthread_create undefined\n", SanitizerToolName); return; // Can't spawn the thread anyway. } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index bf2002b6b3de63..12df3ef73da4bc 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1845,18 +1845,18 @@ HandleSignalMode GetHandleSignalMode(int signum) { # if !SANITIZER_GO void *internal_start_thread(void *(*func)(void *arg), void *arg) { - if (&real_pthread_create == 0) + if (&internal_pthread_create == 0) return nullptr; // Start the thread with signals blocked, otherwise it can steal user signals. ScopedBlockSignals block(nullptr); void *th; - real_pthread_create(&th, nullptr, func, arg); + internal_pthread_create(&th, nullptr, func, arg); return th; } void internal_join_thread(void *th) { - if (&real_pthread_join) - real_pthread_join(th, nullptr); + if (&internal_pthread_join) + internal_pthread_join(th, nullptr); } # else void *internal_start_thread(void *(*func)(void *), void *arg) { return 0; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h index c5811dffea94b5..14617e4771bec4 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h @@ -74,21 +74,21 @@ int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp, // These functions call appropriate pthread_ functions directly, bypassing // the interceptor. They are weak and may not be present in some tools. 
SANITIZER_WEAK_ATTRIBUTE -int real_pthread_create(void *th, void *attr, void *(*callback)(void *), - void *param); +int internal_pthread_create(void *th, void *attr, void *(*callback)(void *), + void *param); SANITIZER_WEAK_ATTRIBUTE -int real_pthread_join(void *th, void **ret); - -#define DEFINE_REAL_PTHREAD_FUNCTIONS \ - namespace __sanitizer { \ - int real_pthread_create(void *th, void *attr, void *(*callback)(void *), \ - void *param) { \ - return REAL(pthread_create)(th, attr, callback, param); \ - } \ - int real_pthread_join(void *th, void **ret) { \ - return REAL(pthread_join(th, ret)); \ - } \ - } // namespace __sanitizer +int internal_pthread_join(void *th, void **ret); + +# define DEFINE_INTERNAL_PTHREAD_FUNCTIONS \ + namespace __sanitizer { \ + int internal_pthread_create(void *th, void *attr, \ + void *(*callback)(void *), void *param) { \ + return REAL(pthread_create)(th, attr, callback, param); \ + } \ + int internal_pthread_join(void *th, void **ret) { \ + return REAL(pthread_join(th, ret)); \ + } \ + } // namespace __sanitizer int internal_pthread_attr_getstack(void *attr, void **addr, uptr *size); diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index e74930148f25ee..5685a9335316d1 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -190,30 +190,31 @@ TEST_F(MapAllocatorTest, SecondaryIterate) { Str.output(); } -TEST_F(MapAllocatorTest, SecondaryOptions) { - // Test options that are only meaningful if the secondary cache is enabled. - if (Allocator->canCache(0U)) { - // Attempt to set a maximum number of entries higher than the array size. - EXPECT_TRUE( - Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4096U)); - // Attempt to set an invalid (negative) number of entries - EXPECT_FALSE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, -1)); - // Various valid combinations. - EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); - EXPECT_TRUE( - Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); - EXPECT_TRUE(Allocator->canCache(1UL << 18)); - EXPECT_TRUE( - Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17)); - EXPECT_FALSE(Allocator->canCache(1UL << 18)); - EXPECT_TRUE(Allocator->canCache(1UL << 16)); - EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 0U)); - EXPECT_FALSE(Allocator->canCache(1UL << 16)); - EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); - EXPECT_TRUE( - Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); - EXPECT_TRUE(Allocator->canCache(1UL << 16)); - } +TEST_F(MapAllocatorTest, SecondaryCacheOptions) { + if (!Allocator->canCache(0U)) + TEST_SKIP("Secondary Cache disabled"); + + // Attempt to set a maximum number of entries higher than the array size. + EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4096U)); + + // Attempt to set an invalid (negative) number of entries + EXPECT_FALSE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, -1)); + + // Various valid combinations. 
+ EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE( + Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(Allocator->canCache(1UL << 18)); + EXPECT_TRUE( + Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17)); + EXPECT_FALSE(Allocator->canCache(1UL << 18)); + EXPECT_TRUE(Allocator->canCache(1UL << 16)); + EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 0U)); + EXPECT_FALSE(Allocator->canCache(1UL << 16)); + EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE( + Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(Allocator->canCache(1UL << 16)); } struct MapAllocatorWithReleaseTest : public MapAllocatorTest { diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index 94adea777cafd9..034ae3d322b56b 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -1088,7 +1088,18 @@ TSAN_INTERCEPTOR(int, pthread_join, void *th, void **ret) { return res; } -DEFINE_REAL_PTHREAD_FUNCTIONS +// DEFINE_INTERNAL_PTHREAD_FUNCTIONS +namespace __sanitizer { +int internal_pthread_create(void *th, void *attr, void *(*callback)(void *), + void *param) { + ScopedIgnoreInterceptors ignore; + return REAL(pthread_create)(th, attr, callback, param); +} +int internal_pthread_join(void *th, void **ret) { + ScopedIgnoreInterceptors ignore; + return REAL(pthread_join(th, ret)); +} +} // namespace __sanitizer TSAN_INTERCEPTOR(int, pthread_detach, void *th) { SCOPED_INTERCEPTOR_RAW(pthread_detach, th); diff --git a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp index 6f118e0979e2de..e129e9af272f5f 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp @@ -9,17 +9,19 @@ // This file is a part of ThreadSanitizer (TSan), a race detector. // //===----------------------------------------------------------------------===// +#include "tsan_mman.h" + #include "sanitizer_common/sanitizer_allocator_checks.h" #include "sanitizer_common/sanitizer_allocator_interface.h" #include "sanitizer_common/sanitizer_allocator_report.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_errno.h" #include "sanitizer_common/sanitizer_placement_new.h" +#include "sanitizer_common/sanitizer_stackdepot.h" +#include "tsan_flags.h" #include "tsan_interface.h" -#include "tsan_mman.h" -#include "tsan_rtl.h" #include "tsan_report.h" -#include "tsan_flags.h" +#include "tsan_rtl.h" namespace __tsan { @@ -115,12 +117,21 @@ ScopedGlobalProcessor::~ScopedGlobalProcessor() { gp->mtx.Unlock(); } -void AllocatorLock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { +void AllocatorLockBeforeFork() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { global_proc()->internal_alloc_mtx.Lock(); InternalAllocatorLock(); -} - -void AllocatorUnlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { +#if !SANITIZER_APPLE + // OS X allocates from hooks, see 6a3958247a. 
+ allocator()->ForceLock(); + StackDepotLockBeforeFork(); +#endif +} + +void AllocatorUnlockAfterFork(bool child) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { +#if !SANITIZER_APPLE + StackDepotUnlockAfterFork(child); + allocator()->ForceUnlock(); +#endif InternalAllocatorUnlock(); global_proc()->internal_alloc_mtx.Unlock(); } diff --git a/compiler-rt/lib/tsan/rtl/tsan_mman.h b/compiler-rt/lib/tsan/rtl/tsan_mman.h index 2095f28c0253e4..f01bbc4d15718f 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_mman.h +++ b/compiler-rt/lib/tsan/rtl/tsan_mman.h @@ -24,8 +24,8 @@ void ReplaceSystemMalloc(); void AllocatorProcStart(Processor *proc); void AllocatorProcFinish(Processor *proc); void AllocatorPrintStats(); -void AllocatorLock(); -void AllocatorUnlock(); +void AllocatorLockBeforeFork(); +void AllocatorUnlockAfterFork(bool child); void GlobalProcessorLock(); void GlobalProcessorUnlock(); diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp index fd9441dfcb53cb..e5ebb65754b327 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp @@ -35,8 +35,10 @@ extern "C" void __tsan_resume() { __tsan_resumed = 1; } +#if SANITIZER_APPLE SANITIZER_WEAK_DEFAULT_IMPL void __tsan_test_only_on_fork() {} +#endif namespace __tsan { @@ -813,7 +815,7 @@ void ForkBefore(ThreadState* thr, uptr pc) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { ctx->thread_registry.Lock(); ctx->slot_mtx.Lock(); ScopedErrorReportLock::Lock(); - AllocatorLock(); + AllocatorLockBeforeFork(); // Suppress all reports in the pthread_atfork callbacks. // Reports will deadlock on the report_mtx. // We could ignore sync operations as well, @@ -828,14 +830,17 @@ void ForkBefore(ThreadState* thr, uptr pc) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { // Disables memory write in OnUserAlloc/Free. thr->ignore_reads_and_writes++; +# if SANITIZER_APPLE __tsan_test_only_on_fork(); +# endif } -static void ForkAfter(ThreadState* thr) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { +static void ForkAfter(ThreadState* thr, + bool child) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { thr->suppress_reports--; // Enabled in ForkBefore. 
thr->ignore_interceptors--; thr->ignore_reads_and_writes--; - AllocatorUnlock(); + AllocatorUnlockAfterFork(child); ScopedErrorReportLock::Unlock(); ctx->slot_mtx.Unlock(); ctx->thread_registry.Unlock(); @@ -845,10 +850,10 @@ static void ForkAfter(ThreadState* thr) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { GlobalProcessorUnlock(); } -void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr); } +void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr, false); } void ForkChildAfter(ThreadState* thr, uptr pc, bool start_thread) { - ForkAfter(thr); + ForkAfter(thr, true); u32 nthread = ctx->thread_registry.OnFork(thr->tid); VPrintf(1, "ThreadSanitizer: forked new process with pid %d," diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 64b9a9d025e7b5..70bf43e2fac593 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -586,21 +586,18 @@ def get_macos_aligned_version(macos_vers): for vers in min_macos_deployment_target_substitutions: flag = config.apple_platform_min_deployment_target_flag major, minor = get_macos_aligned_version(vers) - if "mtargetos" in flag: + apple_device = "" + sim = "" + if "target" in flag: + apple_device = config.apple_platform.split("sim")[0] sim = "-simulator" if "sim" in config.apple_platform else "" - config.substitutions.append( - ( - "%%min_macos_deployment_target=%s.%s" % vers, - "{}{}.{}{}".format(flag, major, minor, sim), - ) - ) - else: - config.substitutions.append( - ( - "%%min_macos_deployment_target=%s.%s" % vers, - "{}={}.{}".format(flag, major, minor), - ) + + config.substitutions.append( + ( + "%%min_macos_deployment_target=%s.%s" % vers, + "{}={}{}.{}{}".format(flag, apple_device, major, minor, sim), ) + ) else: for vers in min_macos_deployment_target_substitutions: config.substitutions.append(("%%min_macos_deployment_target=%s.%s" % vers, "")) diff --git a/compiler-rt/test/metadata/common.h b/compiler-rt/test/metadata/common.h index af0e00902628e0..d9c26872b614fa 100644 --- a/compiler-rt/test/metadata/common.h +++ b/compiler-rt/test/metadata/common.h @@ -46,7 +46,7 @@ const char *meta_start; const char *meta_end; const char *atomics_start; const char *atomics_end; -}; // namespace +} // namespace extern "C" { void __sanitizer_metadata_covered_add(uint32_t version, const char *start, diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp index e51805bdf923cb..efb24751da5ffe 100644 --- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp +++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp @@ -1,63 +1,65 @@ -// REQUIRES: lld-available +// REQUIRES: lld, lld-available // Building the instrumented binary will fail because lld doesn't support // big-endian ELF for PPC (aka ABI 1). // ld.lld: error: /lib/../lib64/Scrt1.o: ABI version 1 is not supported // UNSUPPORTED: ppc && host-byteorder-big-endian -// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -g -fprofile-generate=. -mllvm -enable-vtable-value-profiling %s -o %t-test -// RUN: env LLVM_PROFILE_FILE=%t-test.profraw %t-test +// RUN: rm -rf %t && mkdir %t && cd %t + +// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -fprofile-generate=. -mllvm -enable-vtable-value-profiling %s -o test +// RUN: env LLVM_PROFILE_FILE=test.profraw ./test // Show vtable profiles from raw profile. 
-// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-test.profraw | FileCheck %s --check-prefixes=COMMON,RAW +// RUN: llvm-profdata show --function=main --ic-targets --show-vtables test.profraw | FileCheck %s --check-prefixes=COMMON,RAW // Generate indexed profile from raw profile and show the data. -// RUN: llvm-profdata merge %t-test.profraw -o %t-test.profdata -// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-test.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED +// RUN: llvm-profdata merge --keep-vtable-symbols test.profraw -o test.profdata +// RUN: llvm-profdata show --function=main --ic-targets --show-vtables test.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED // Generate text profile from raw and indexed profiles respectively and show the data. -// RUN: llvm-profdata merge --text %t-test.profraw -o %t-raw.proftext -// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text %t-raw.proftext | FileCheck %s --check-prefix=ICTEXT -// RUN: llvm-profdata merge --text %t-test.profdata -o %t-indexed.proftext -// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text %t-indexed.proftext | FileCheck %s --check-prefix=ICTEXT +// RUN: llvm-profdata merge --keep-vtable-symbols --text test.profraw -o raw.proftext +// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text raw.proftext | FileCheck %s --check-prefix=ICTEXT +// RUN: llvm-profdata merge --keep-vtable-symbols --text test.profdata -o indexed.proftext +// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text indexed.proftext | FileCheck %s --check-prefix=ICTEXT // Generate indexed profile from text profiles and show the data -// RUN: llvm-profdata merge --binary %t-raw.proftext -o %t-text.profraw -// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-text.profraw | FileCheck %s --check-prefixes=COMMON,INDEXED -// RUN: llvm-profdata merge --binary %t-indexed.proftext -o %t-text.profdata -// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-text.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED +// RUN: llvm-profdata merge --keep-vtable-symbols --binary raw.proftext -o text.profraw +// RUN: llvm-profdata show --function=main --ic-targets --show-vtables text.profraw | FileCheck %s --check-prefixes=COMMON,INDEXED +// RUN: llvm-profdata merge --keep-vtable-symbols --binary indexed.proftext -o text.profdata +// RUN: llvm-profdata show --function=main --ic-targets --show-vtables text.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED // COMMON: Counters: // COMMON-NEXT: main: -// COMMON-NEXT: Hash: 0x0f9a16fe6d398548 -// COMMON-NEXT: Counters: 2 +// COMMON-NEXT: Hash: 0x068617320ec408a0 +// COMMON-NEXT: Counters: 4 // COMMON-NEXT: Indirect Call Site Count: 2 // COMMON-NEXT: Number of instrumented vtables: 2 // RAW: Indirect Target Results: -// RAW-NEXT: [ 0, _ZN8Derived15func1Eii, 250 ] (25.00%) -// RAW-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func1Eii, 750 ] (75.00%) -// RAW-NEXT: [ 1, _ZN8Derived15func2Eii, 250 ] (25.00%) -// RAW-NEXT: [ 1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func2Eii, 750 ] (75.00%) +// RAW-NEXT: [ 0, _ZN8Derived14funcEii, 50 ] (25.00%) +// RAW-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii, 150 ] (75.00%) +// RAW-NEXT: [ 1, _ZN8Derived1D0Ev, 250 ] (25.00%) +// RAW-NEXT: [ 1, 
{{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev, 750 ] (75.00%) // RAW-NEXT: VTable Results: -// RAW-NEXT: [ 0, _ZTV8Derived1, 250 ] (25.00%) -// RAW-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E, 750 ] (75.00%) +// RAW-NEXT: [ 0, _ZTV8Derived1, 50 ] (25.00%) +// RAW-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E, 150 ] (75.00%) // RAW-NEXT: [ 1, _ZTV8Derived1, 250 ] (25.00%) // RAW-NEXT: [ 1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E, 750 ] (75.00%) // INDEXED: Indirect Target Results: -// INDEXED-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func1Eii, 750 ] (75.00%) -// INDEXED-NEXT: [ 0, _ZN8Derived15func1Eii, 250 ] (25.00%) -// INDEXED-NEXT: [ 1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func2Eii, 750 ] (75.00%) -// INDEXED-NEXT: [ 1, _ZN8Derived15func2Eii, 250 ] (25.00%) +// INDEXED-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii, 150 ] (75.00%) +// INDEXED-NEXT: [ 0, _ZN8Derived14funcEii, 50 ] (25.00%) +// INDEXED-NEXT: [ 1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev, 750 ] (75.00%) +// INDEXED-NEXT: [ 1, _ZN8Derived1D0Ev, 250 ] (25.00%) // INDEXED-NEXT: VTable Results: -// INDEXED-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E, 750 ] (75.00%) -// INDEXED-NEXT: [ 0, _ZTV8Derived1, 250 ] (25.00%) +// INDEXED-NEXT: [ 0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E, 150 ] (75.00%) +// INDEXED-NEXT: [ 0, _ZTV8Derived1, 50 ] (25.00%) // INDEXED-NEXT: [ 1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E, 750 ] (75.00%) // INDEXED-NEXT: [ 1, _ZTV8Derived1, 250 ] (25.00%) // COMMON: Instrumentation level: IR entry_first = 0 // COMMON-NEXT: Functions shown: 1 -// COMMON-NEXT: Total functions: 6 +// COMMON-NEXT: Total functions: 7 // COMMON-NEXT: Maximum function count: 1000 -// COMMON-NEXT: Maximum internal block count: 250 +// COMMON-NEXT: Maximum internal block count: 1000 // COMMON-NEXT: Statistics for indirect call sites profile: // COMMON-NEXT: Total number of sites: 2 // COMMON-NEXT: Total number of sites with values: 2 @@ -76,11 +78,13 @@ // ICTEXT: :ir // ICTEXT: main // ICTEXT: # Func Hash: -// ICTEXT: 1124236338992350536 +// ICTEXT: 470088714870327456 // ICTEXT: # Num Counters: -// ICTEXT: 2 +// ICTEXT: 4 // ICTEXT: # Counter Values: // ICTEXT: 1000 +// ICTEXT: 1000 +// ICTEXT: 200 // ICTEXT: 1 // ICTEXT: # Num Value Kinds: // ICTEXT: 2 @@ -89,41 +93,98 @@ // ICTEXT: # NumValueSites: // ICTEXT: 2 // ICTEXT: 2 -// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func1Eii:750 -// ICTEXT: _ZN8Derived15func1Eii:250 +// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii:150 +// ICTEXT: _ZN8Derived14funcEii:50 // ICTEXT: 2 -// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func2Eii:750 -// ICTEXT: _ZN8Derived15func2Eii:250 +// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev:750 +// ICTEXT: _ZN8Derived1D0Ev:250 // ICTEXT: # ValueKind = IPVK_VTableTarget: // ICTEXT: 2 // ICTEXT: # NumValueSites: // ICTEXT: 2 // ICTEXT: 2 -// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750 -// ICTEXT: _ZTV8Derived1:250 +// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:150 +// ICTEXT: _ZTV8Derived1:50 // ICTEXT: 2 // ICTEXT: 
{{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
// ICTEXT: _ZTV8Derived1:250
+// Test indirect call promotion transformation using vtable profiles.
+// - Build with `-g` to enable debug information.
+// - In real-world settings, the ICP pass is disabled in the prelink pipeline. In
+// the postlink pipeline, ICP is enabled after the whole-program-devirtualization
+// pass. Do the same thing in this test.
+// - Enable `-fwhole-program-vtables` to generate type metadata and intrinsics.
+// - Enable `-fno-split-lto-unit` and `-Wl,--lto-whole-program-visibility` to
+// preserve type intrinsics for the ICP pass.
+// RUN: %clangxx -m64 -fprofile-use=test.profdata -Wl,--lto-whole-program-visibility \
+// RUN: -mllvm -disable-icp=true -Wl,-mllvm,-disable-icp=false -fuse-ld=lld \
+// RUN: -g -flto=thin -fwhole-program-vtables -fno-split-lto-unit -O2 \
+// RUN: -mllvm -enable-vtable-value-profiling -Wl,-mllvm,-enable-vtable-value-profiling \
+// RUN: -mllvm -enable-vtable-profile-use \
+// RUN: -Wl,-mllvm,-enable-vtable-profile-use -Rpass=pgo-icall-prom \
+// RUN: -Wl,-mllvm,-print-after=pgo-icall-prom \
+// RUN: -Wl,-mllvm,-filter-print-funcs=main %s 2>&1 \
+// RUN: | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
+
+// For the indirect call site `ptr->func`
+// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, sink 1 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
+// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+//
+// For the indirect call site `delete ptr`
+// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
+// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, sink 2 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+
+// The IR matchers for indirect callsite `ptr->func`.
+// IR-LABEL: @main +// IR: [[OBJ:%.*]] = {{.*}}call {{.*}} @_Z10createTypei +// IR: [[VTABLE:%.*]] = load ptr, ptr [[OBJ]] +// IR: [[CMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTVN12_GLOBAL__N_18Derived2E, i32 16) +// IR: br i1 [[CMP1]], label %[[BB1:.*]], label %[[BB2:[a-zA-Z0-9_.]+]], +// +// IR: [[BB1]]: +// IR: [[RESBB1:%.*]] = {{.*}}call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii +// IR: br label %[[MERGE0:[a-zA-Z0-9_.]+]] +// +// IR: [[BB2]]: +// IR: [[CMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTV8Derived1, i32 16) +// IR: br i1 [[CMP2]], label %[[BB3:.*]], label %[[BB4:[a-zA-Z0-9_.]+]], +// +// IR: [[BB3]]: +// IR: [[RESBB3:%.*]] = {{.*}}call {{.*}} @_ZN8Derived14funcEii +// IR: br label %[[MERGE1:[a-zA-Z0-9_.]+]], +// +// IR: [[BB4]]: +// IR: [[FUNCPTR:%.*]] = load ptr, ptr [[VTABLE]] +// IR: [[RESBB4:%.*]] = {{.*}}call {{.*}} [[FUNCPTR]] +// IR: br label %[[MERGE1]] +// +// IR: [[MERGE1]]: +// IR: [[RES1:%.*]] = phi i32 [ [[RESBB4]], %[[BB4]] ], [ [[RESBB3]], %[[BB3]] ] +// IR: br label %[[MERGE0]] +// +// IR: [[MERGE0]]: +// IR: [[RES2:%.*]] = phi i32 [ [[RES1]], %[[MERGE1]] ], [ [[RESBB1]], %[[BB1]] ] #include #include class Base { public: - virtual int func1(int a, int b) = 0; - virtual int func2(int a, int b) = 0; + virtual int func(int a, int b) = 0; + + virtual ~Base() {}; }; class Derived1 : public Base { public: - int func1(int a, int b) override { return a + b; } + int func(int a, int b) override { return a * b; } - int func2(int a, int b) override { return a * b; } + ~Derived1() {} }; namespace { class Derived2 : public Base { public: - int func1(int a, int b) override { return a - b; } + int func(int a, int b) override { return a * (a - b); } - int func2(int a, int b) override { return a * (a - b); } + ~Derived2() {} }; } // namespace __attribute__((noinline)) Base *createType(int a) { @@ -140,7 +201,10 @@ int main(int argc, char **argv) { int a = rand(); int b = rand(); Base *ptr = createType(i); - sum += ptr->func1(a, b) + ptr->func2(b, a); + if (i % 5 == 0) + sum += ptr->func(b, a); + + delete ptr; } printf("sum is %d\n", sum); return 0; diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c b/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c index 13240234a1c79b..27b67db0c0a38c 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c @@ -6,9 +6,6 @@ // FIXME: It probably hangs on this platform. // UNSUPPORTED: ppc -// FIXME: TSAN does not lock allocator. 
-// UNSUPPORTED: tsan - // FIXME: False stack overflow report // UNSUPPORTED: android && asan diff --git a/compiler-rt/test/tsan/Linux/fork_deadlock.cpp b/compiler-rt/test/tsan/Darwin/fork_deadlock.cpp similarity index 96% rename from compiler-rt/test/tsan/Linux/fork_deadlock.cpp rename to compiler-rt/test/tsan/Darwin/fork_deadlock.cpp index 952507032df65d..2a5b81cae24490 100644 --- a/compiler-rt/test/tsan/Linux/fork_deadlock.cpp +++ b/compiler-rt/test/tsan/Darwin/fork_deadlock.cpp @@ -1,3 +1,6 @@ +// TODO(#96597): fix and reenable +// UNSUPPORTED: darwin + // RUN: %clangxx_tsan -O1 %s -o %t && %env_tsan_opts=atexit_sleep_ms=0 %run %t 2>&1 | FileCheck %s // This test models what happens on Mac when fork diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 8853d4d9e1c79c..d1f7cd8372e243 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -893,16 +893,17 @@ used in constant expressions have currently no folding support at all. ##### `CMDSTAT`: - Synchronous execution: - - -2: No error condition occurs, but `WAIT` is present with the value `false`, and the processor does not support asynchronous execution. - - -1: The processor does not support command line execution. + - -2: `ASYNC_NO_SUPPORT_ERR` - No error condition occurs, but `WAIT` is present with the value `false`, and the processor does not support asynchronous execution. + - -1: `NO_SUPPORT_ERR` - The processor does not support command line execution. (system returns -1 with errno `ENOENT`) + - 0: `CMD_EXECUTED` - Command executed with no error. - \+ (positive value): An error condition occurs. - - 1: Fork Error (occurs only on POSIX-compatible systems). - - 2: Execution Error (command exits with status -1). - - 3: Invalid Command Error (determined by the exit code depending on the system). - - On Windows: exit code is 1. - - On POSIX-compatible systems: exit code is 127 or 126. - - 4: Signal error (either stopped or killed by signal, occurs only on POSIX-compatible systems). - - 0: Otherwise. + - 1: `FORK_ERR` - Fork Error (occurs only on POSIX-compatible systems). + - 2: `EXECL_ERR` - Execution Error (system returns -1 with other errno). + - 3: `COMMAND_EXECUTION_ERR` - Invalid Command Error (exit code 1). + - 4: `COMMAND_CANNOT_EXECUTE_ERR` - Command Cannot Execute Error (Linux exit code 126). + - 5: `COMMAND_NOT_FOUND_ERR` - Command Not Found Error (Linux exit code 127). + - 6: `INVALID_CL_ERR` - Invalid Command Line Error (covers all other non-zero exit codes). + - 7: `SIGNAL_ERR` - Signal error (either stopped or killed by signal, occurs only on POSIX-compatible systems). - Asynchronous execution: - 0 will always be assigned. diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h index 9695c665d0cb5d..11533a7259b055 100644 --- a/flang/include/flang/Evaluate/characteristics.h +++ b/flang/include/flang/Evaluate/characteristics.h @@ -55,8 +55,8 @@ std::optional DistinguishableOpOrAssign( // Shapes of function results and dummy arguments have to have // the same rank, the same deferred dimensions, and the same // values for explicit dimensions when constant. 
-bool ShapesAreCompatible( - const Shape &, const Shape &, bool *possibleWarning = nullptr); +bool ShapesAreCompatible(const std::optional &, + const std::optional &, bool *possibleWarning = nullptr); class TypeAndShape { public: @@ -64,17 +64,17 @@ class TypeAndShape { Attr, AssumedRank, AssumedShape, AssumedSize, DeferredShape, Coarray) using Attrs = common::EnumSet; - explicit TypeAndShape(DynamicType t) : type_{t} { AcquireLEN(); } - TypeAndShape(DynamicType t, int rank) : type_{t}, shape_(rank) { + explicit TypeAndShape(DynamicType t) : type_{t}, shape_{Shape{}} { + AcquireLEN(); + } + TypeAndShape(DynamicType t, int rank) : type_{t}, shape_{Shape(rank)} { AcquireLEN(); } TypeAndShape(DynamicType t, Shape &&s) : type_{t}, shape_{std::move(s)} { AcquireLEN(); } TypeAndShape(DynamicType t, std::optional &&s) : type_{t} { - if (s) { - shape_ = std::move(*s); - } + shape_ = std::move(s); AcquireLEN(); } DEFAULT_CONSTRUCTORS_AND_ASSIGNMENTS(TypeAndShape) @@ -172,11 +172,12 @@ class TypeAndShape { LEN_ = std::move(len); return *this; } - const Shape &shape() const { return shape_; } + const std::optional &shape() const { return shape_; } const Attrs &attrs() const { return attrs_; } int corank() const { return corank_; } - int Rank() const { return GetRank(shape_); } + // Return -1 for assumed-rank as a safety. + int Rank() const { return shape_ ? GetRank(*shape_) : -1; } // Can sequence association apply to this argument? bool CanBeSequenceAssociated() const { @@ -211,7 +212,7 @@ class TypeAndShape { protected: DynamicType type_; std::optional> LEN_; - Shape shape_; + std::optional shape_; Attrs attrs_; int corank_{0}; }; diff --git a/flang/include/flang/Evaluate/shape.h b/flang/include/flang/Evaluate/shape.h index 1294c92a01abb6..e33044c0d34e56 100644 --- a/flang/include/flang/Evaluate/shape.h +++ b/flang/include/flang/Evaluate/shape.h @@ -46,6 +46,13 @@ Constant AsConstantShape(const ConstantSubscripts &); ConstantSubscripts AsConstantExtents(const Constant &); std::optional AsConstantExtents( FoldingContext &, const Shape &); +inline std::optional AsConstantExtents( + FoldingContext &foldingContext, const std::optional &maybeShape) { + if (maybeShape) { + return AsConstantExtents(foldingContext, *maybeShape); + } + return std::nullopt; +} Shape AsShape(const ConstantSubscripts &); std::optional AsShape(const std::optional &); @@ -121,6 +128,12 @@ MaybeExtentExpr CountTrips( // Computes SIZE() == PRODUCT(shape) MaybeExtentExpr GetSize(Shape &&); ConstantSubscript GetSize(const ConstantSubscripts &); +inline MaybeExtentExpr GetSize(const std::optional &maybeShape) { + if (maybeShape) { + return GetSize(Shape(*maybeShape)); + } + return std::nullopt; +} // Utility predicate: does an expression reference any implied DO index? 
bool ContainsAnyImpliedDoIndex(const ExtentExpr &); diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 340325b59c0abb..625f9e5f6576fc 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -98,6 +98,9 @@ template bool IsAssumedRank(const Expr &expr) { template bool IsAssumedRank(const std::optional &x) { return x && IsAssumedRank(*x); } +template bool IsAssumedRank(const A *x) { + return x && IsAssumedRank(*x); +} // Predicate: true when an expression is a coarray (corank > 0) bool IsCoarray(const ActualArgument &); diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index f43dfd8343ece5..daded9091780e2 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -17,6 +17,7 @@ #include "flang/Lower/LoweringOptions.h" #include "flang/Lower/PFTDefs.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Dialect/FIRAttr.h" #include "flang/Semantics/symbol.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -126,8 +127,8 @@ class AbstractConverter { const Fortran::semantics::Symbol &sym, mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr) = 0; - virtual void copyVar(mlir::Location loc, mlir::Value dst, - mlir::Value src) = 0; + virtual void copyVar(mlir::Location loc, mlir::Value dst, mlir::Value src, + fir::FortranVariableFlagsEnum attrs) = 0; /// For a given symbol, check if it is present in the inner-most /// level of the symbol map. diff --git a/flang/include/flang/Lower/BoxAnalyzer.h b/flang/include/flang/Lower/BoxAnalyzer.h index 3b8e2455ff273b..8eca7d66a71bf3 100644 --- a/flang/include/flang/Lower/BoxAnalyzer.h +++ b/flang/include/flang/Lower/BoxAnalyzer.h @@ -97,7 +97,7 @@ struct ScalarDynamicDerived : ScalarSym { : ScalarSym{sym}, lens{std::move(lens)} {} private: - llvm::SmallVector lens; + llvm::SmallVector lens; }; struct LBoundsAndShape { diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 1ca1539e76fc63..df709645c01b0b 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -57,6 +57,8 @@ namespace fir { #define GEN_PASS_DECL_OMPFUNCTIONFILTERING #define GEN_PASS_DECL_VSCALEATTR #define GEN_PASS_DECL_FUNCTIONATTR +#define GEN_PASS_DECL_CONSTANTARGUMENTGLOBALISATIONOPT + #include "flang/Optimizer/Transforms/Passes.h.inc" std::unique_ptr createAffineDemotionPass(); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 27aee5650e75d5..b3ed9acad36df4 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -251,6 +251,15 @@ def MemoryAllocationOpt : Pass<"memory-allocation-opt", "mlir::func::FuncOp"> { ]; } +// This needs to be a "mlir::ModuleOp" pass, because it inserts global constants +def ConstantArgumentGlobalisationOpt : Pass<"constant-argument-globalisation-opt", "mlir::ModuleOp"> { + let summary = "Convert constant function arguments to global constants."; + let description = [{ + Convert scalar literals of function arguments to global constants. 
+ }]; + let dependentDialects = [ "fir::FIROpsDialect" ]; +} + def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> { let summary = "Move local array allocations from heap memory into stack memory"; let description = [{ diff --git a/flang/include/flang/Parser/preprocessor.h b/flang/include/flang/Parser/preprocessor.h index c3076435be5f0b..57690dd226f628 100644 --- a/flang/include/flang/Parser/preprocessor.h +++ b/flang/include/flang/Parser/preprocessor.h @@ -82,6 +82,7 @@ class Preprocessor { bool IsNameDefined(const CharBlock &); bool IsFunctionLikeDefinition(const CharBlock &); bool AnyDefinitions() const { return !definitions_.empty(); } + bool InConditional() const { return !ifStack_.empty(); } // When called with partialFunctionLikeMacro not null, MacroReplacement() // and ReplaceMacros() handle an unclosed function-like macro reference diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h index 6ceb70ebb676d0..704144f08114f2 100644 --- a/flang/include/flang/Runtime/pointer.h +++ b/flang/include/flang/Runtime/pointer.h @@ -115,6 +115,11 @@ bool RTDECL(PointerIsAssociated)(const Descriptor &); bool RTDECL(PointerIsAssociatedWith)( const Descriptor &, const Descriptor *target); +// Fortran POINTERs are allocated with an extra validation word after their +// payloads in order to detect erroneous deallocations later. +RT_API_ATTRS void *AllocateValidatedPointerPayload(std::size_t); +RT_API_ATTRS bool ValidatePointerPayload(const ISO::CFI_cdesc_t &); + } // extern "C" } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_POINTER_H_ diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index bb4347d0e82c8f..7f2910c5cfd3c3 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -25,6 +25,10 @@ static llvm::cl::opt disable##DOName("disable-" DOOption, \ llvm::cl::desc("disable " DODescription " pass"), llvm::cl::init(false), \ llvm::cl::Hidden) +#define EnableOption(EOName, EOOption, EODescription) \ + static llvm::cl::opt enable##EOName("enable-" EOOption, \ + llvm::cl::desc("enable " EODescription " pass"), llvm::cl::init(false), \ + llvm::cl::Hidden) /// Shared option in tools to control whether dynamically sized array /// allocations should always be on the heap. @@ -86,6 +90,8 @@ DisableOption(BoxedProcedureRewrite, "boxed-procedure-rewrite", DisableOption(ExternalNameConversion, "external-name-interop", "convert names with external convention"); +EnableOption(ConstantArgumentGlobalisation, "constant-argument-globalisation", + "the local constant argument to global constant conversion"); using PassConstructor = std::unique_ptr(); @@ -270,6 +276,8 @@ inline void createDefaultFIROptimizerPassPipeline( // These passes may increase code size. pm.addPass(fir::createSimplifyIntrinsics()); pm.addPass(fir::createAlgebraicSimplificationPass(config)); + if (enableConstantArgumentGlobalisation) + pm.addPass(fir::createConstantArgumentGlobalisationOpt()); } if (pc.LoopVersioning) diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp index a0ce190b90e926..70e24d6e82eb8e 100644 --- a/flang/lib/Evaluate/characteristics.cpp +++ b/flang/lib/Evaluate/characteristics.cpp @@ -39,13 +39,16 @@ static void CopyAttrs(const semantics::Symbol &src, A &dst, // Shapes of function results and dummy arguments have to have // the same rank, the same deferred dimensions, and the same // values for explicit dimensions when constant. 
-bool ShapesAreCompatible( - const Shape &x, const Shape &y, bool *possibleWarning) { - if (x.size() != y.size()) { +bool ShapesAreCompatible(const std::optional &x, + const std::optional &y, bool *possibleWarning) { + if (!x || !y) { + return !x && !y; + } + if (x->size() != y->size()) { return false; } - auto yIter{y.begin()}; - for (const auto &xDim : x) { + auto yIter{y->begin()}; + for (const auto &xDim : *x) { const auto &yDim{*yIter++}; if (xDim && yDim) { if (auto equiv{AreEquivalentInInterface(*xDim, *yDim)}) { @@ -178,9 +181,11 @@ bool TypeAndShape::IsCompatibleWith(parser::ContextualMessages &messages, thatIs, that.AsFortran(), thisIs, AsFortran()); return false; } - return omitShapeConformanceCheck || - CheckConformance(messages, shape_, that.shape_, flags, thisIs, thatIs) - .value_or(true /*fail only when nonconformance is known now*/); + return omitShapeConformanceCheck || (!shape_ && !that.shape_) || + (shape_ && that.shape_ && + CheckConformance( + messages, *shape_, *that.shape_, flags, thisIs, thatIs) + .value_or(true /*fail only when nonconformance is known now*/)); } std::optional> TypeAndShape::MeasureElementSizeInBytes( @@ -201,11 +206,11 @@ std::optional> TypeAndShape::MeasureElementSizeInBytes( std::optional> TypeAndShape::MeasureSizeInBytes( FoldingContext &foldingContext) const { - if (auto elements{GetSize(Shape{shape_})}) { + if (auto elements{GetSize(shape_)}) { // Sizes of arrays (even with single elements) are multiples of // their alignments. if (auto elementBytes{ - MeasureElementSizeInBytes(foldingContext, GetRank(shape_) > 0)}) { + MeasureElementSizeInBytes(foldingContext, Rank() > 0)}) { return Fold( foldingContext, std::move(*elements) * std::move(*elementBytes)); } @@ -254,10 +259,12 @@ std::string TypeAndShape::AsFortran() const { llvm::raw_ostream &TypeAndShape::Dump(llvm::raw_ostream &o) const { o << type_.AsFortran(LEN_ ? LEN_->AsFortran() : ""); attrs_.Dump(o, EnumToString); - if (!shape_.empty()) { + if (!shape_) { + o << " dimension(..)"; + } else if (!shape_->empty()) { o << " dimension"; char sep{'('}; - for (const auto &expr : shape_) { + for (const auto &expr : *shape_) { o << sep; sep = ','; if (expr) { @@ -1112,6 +1119,7 @@ bool FunctionResult::CanBeReturnedViaImplicitInterface( static std::optional AreIncompatibleFunctionResultShapes( const Shape &x, const Shape &y) { + // Function results cannot be assumed-rank, hence the non optional arguments. 
int rank{GetRank(x)}; if (int yrank{GetRank(y)}; yrank != rank) { return "rank "s + std::to_string(rank) + " vs " + std::to_string(yrank); @@ -1147,7 +1155,8 @@ bool FunctionResult::IsCompatibleWith( } } else if (!attrs.test(Attr::Allocatable) && !attrs.test(Attr::Pointer) && (details = AreIncompatibleFunctionResultShapes( - ifaceTypeShape->shape(), actualTypeShape->shape()))) { + ifaceTypeShape->shape().value(), + actualTypeShape->shape().value()))) { if (whyNot) { *whyNot = "function results have distinct extents (" + *details + ')'; } diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index a4b152c60a72fc..342aac4dd5d53e 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -419,7 +419,7 @@ std::optional> NonPointerInitializationExpr(const Symbol &symbol, if (converted) { auto folded{Fold(context, std::move(*converted))}; if (IsActuallyConstant(folded)) { - int symRank{GetRank(symTS->shape())}; + int symRank{symTS->Rank()}; if (IsImpliedShape(symbol)) { if (folded.Rank() == symRank) { return ArrayConstantBoundChanger{ @@ -442,7 +442,8 @@ std::optional> NonPointerInitializationExpr(const Symbol &symbol, context, GetRawLowerBounds(context, NamedEntity{symbol}))} .Expand(std::move(folded)); } else if (auto resultShape{GetShape(context, folded)}) { - if (CheckConformance(context.messages(), symTS->shape(), + CHECK(symTS->shape()); // Assumed-ranks cannot be initialized. + if (CheckConformance(context.messages(), *symTS->shape(), *resultShape, CheckConformanceFlags::None, "initialized object", "initialization expression") .value_or(false /*fail if not known now to conform*/)) { @@ -649,7 +650,8 @@ class CheckSpecificationExprHelper return std::holds_alternative( dummy.u); })}; - if (iter != procChars->dummyArguments.end()) { + if (iter != procChars->dummyArguments.end() && + ultimate.name().ToString() != "__builtin_c_funloc") { return "reference to function '"s + ultimate.name().ToString() + "' with dummy procedure argument '" + iter->name + '\''; } diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp index b76b9d49b58238..981cdff7f350b1 100644 --- a/flang/lib/Evaluate/fold-integer.cpp +++ b/flang/lib/Evaluate/fold-integer.cpp @@ -1116,14 +1116,25 @@ Expr> FoldIntrinsicFunction( return FoldMaxvalMinval( context, std::move(funcRef), RelationalOperator::LT, T::Scalar::HUGE()); } else if (name == "mod") { + bool badPConst{false}; + if (auto *pExpr{UnwrapExpr>(args[1])}) { + *pExpr = Fold(context, std::move(*pExpr)); + if (auto pConst{GetScalarConstantValue(*pExpr)}; pConst && + pConst->IsZero() && + context.languageFeatures().ShouldWarn( + common::UsageWarning::FoldingAvoidsRuntimeCrash)) { + context.messages().Say("MOD: P argument is zero"_warn_en_US); + badPConst = true; + } + } return FoldElementalIntrinsic(context, std::move(funcRef), ScalarFuncWithContext( - [](FoldingContext &context, const Scalar &x, + [badPConst](FoldingContext &context, const Scalar &x, const Scalar &y) -> Scalar { auto quotRem{x.DivideSigned(y)}; if (context.languageFeatures().ShouldWarn( common::UsageWarning::FoldingAvoidsRuntimeCrash)) { - if (quotRem.divisionByZero) { + if (!badPConst && quotRem.divisionByZero) { context.messages().Say("mod() by zero"_warn_en_US); } else if (quotRem.overflow) { context.messages().Say("mod() folding overflowed"_warn_en_US); @@ -1132,12 +1143,23 @@ Expr> FoldIntrinsicFunction( return quotRem.remainder; })); } else if (name == "modulo") { + bool 
badPConst{false}; + if (auto *pExpr{UnwrapExpr>(args[1])}) { + *pExpr = Fold(context, std::move(*pExpr)); + if (auto pConst{GetScalarConstantValue(*pExpr)}; pConst && + pConst->IsZero() && + context.languageFeatures().ShouldWarn( + common::UsageWarning::FoldingAvoidsRuntimeCrash)) { + context.messages().Say("MODULO: P argument is zero"_warn_en_US); + badPConst = true; + } + } return FoldElementalIntrinsic(context, std::move(funcRef), - ScalarFuncWithContext([](FoldingContext &context, + ScalarFuncWithContext([badPConst](FoldingContext &context, const Scalar &x, const Scalar &y) -> Scalar { auto result{x.MODULO(y)}; - if (result.overflow && + if (!badPConst && result.overflow && context.languageFeatures().ShouldWarn( common::UsageWarning::FoldingException)) { context.messages().Say("modulo() folding overflowed"_warn_en_US); diff --git a/flang/lib/Evaluate/fold-real.cpp b/flang/lib/Evaluate/fold-real.cpp index f71addcc4094ac..69c7a924cc1c35 100644 --- a/flang/lib/Evaluate/fold-real.cpp +++ b/flang/lib/Evaluate/fold-real.cpp @@ -303,41 +303,72 @@ Expr> FoldIntrinsicFunction( context, std::move(funcRef), RelationalOperator::LT, T::Scalar::HUGE()); } else if (name == "mod") { CHECK(args.size() == 2); + bool badPConst{false}; + if (auto *pExpr{UnwrapExpr>(args[1])}) { + *pExpr = Fold(context, std::move(*pExpr)); + if (auto pConst{GetScalarConstantValue(*pExpr)}; pConst && + pConst->IsZero() && + context.languageFeatures().ShouldWarn( + common::UsageWarning::FoldingAvoidsRuntimeCrash)) { + context.messages().Say("MOD: P argument is zero"_warn_en_US); + badPConst = true; + } + } return FoldElementalIntrinsic(context, std::move(funcRef), - ScalarFunc( - [&context](const Scalar &x, const Scalar &y) -> Scalar { - auto result{x.MOD(y)}; - if (result.flags.test(RealFlag::DivideByZero) && - context.languageFeatures().ShouldWarn( - common::UsageWarning::FoldingAvoidsRuntimeCrash)) { - context.messages().Say( - "second argument to MOD must not be zero"_warn_en_US); - } - return result.value; - })); + ScalarFunc([&context, badPConst](const Scalar &x, + const Scalar &y) -> Scalar { + auto result{x.MOD(y)}; + if (!badPConst && result.flags.test(RealFlag::DivideByZero) && + context.languageFeatures().ShouldWarn( + common::UsageWarning::FoldingAvoidsRuntimeCrash)) { + context.messages().Say( + "second argument to MOD must not be zero"_warn_en_US); + } + return result.value; + })); } else if (name == "modulo") { CHECK(args.size() == 2); + bool badPConst{false}; + if (auto *pExpr{UnwrapExpr>(args[1])}) { + *pExpr = Fold(context, std::move(*pExpr)); + if (auto pConst{GetScalarConstantValue(*pExpr)}; pConst && + pConst->IsZero() && + context.languageFeatures().ShouldWarn( + common::UsageWarning::FoldingAvoidsRuntimeCrash)) { + context.messages().Say("MODULO: P argument is zero"_warn_en_US); + badPConst = true; + } + } return FoldElementalIntrinsic(context, std::move(funcRef), - ScalarFunc( - [&context](const Scalar &x, const Scalar &y) -> Scalar { - auto result{x.MODULO(y)}; - if (result.flags.test(RealFlag::DivideByZero) && - context.languageFeatures().ShouldWarn( - common::UsageWarning::FoldingAvoidsRuntimeCrash)) { - context.messages().Say( - "second argument to MODULO must not be zero"_warn_en_US); - } - return result.value; - })); + ScalarFunc([&context, badPConst](const Scalar &x, + const Scalar &y) -> Scalar { + auto result{x.MODULO(y)}; + if (!badPConst && result.flags.test(RealFlag::DivideByZero) && + context.languageFeatures().ShouldWarn( + common::UsageWarning::FoldingAvoidsRuntimeCrash)) { + 
context.messages().Say( + "second argument to MODULO must not be zero"_warn_en_US); + } + return result.value; + })); } else if (name == "nearest") { - if (const auto *sExpr{UnwrapExpr>(args[1])}) { + if (auto *sExpr{UnwrapExpr>(args[1])}) { + *sExpr = Fold(context, std::move(*sExpr)); return common::visit( [&](const auto &sVal) { using TS = ResultType; + bool badSConst{false}; + if (auto sConst{GetScalarConstantValue(sVal)}; sConst && + sConst->IsZero() && + context.languageFeatures().ShouldWarn( + common::UsageWarning::FoldingValueChecks)) { + context.messages().Say("NEAREST: S argument is zero"_warn_en_US); + badSConst = true; + } return FoldElementalIntrinsic(context, std::move(funcRef), ScalarFunc([&](const Scalar &x, const Scalar &s) -> Scalar { - if (s.IsZero() && + if (!badSConst && s.IsZero() && context.languageFeatures().ShouldWarn( common::UsageWarning::FoldingValueChecks)) { context.messages().Say( diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 2733f994576a45..80752d02b5baf4 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -1113,6 +1113,7 @@ static const SpecificIntrinsicInterface specificIntrinsicFunction[]{ {{"ddim", {{"x", DoublePrecision}, {"y", DoublePrecision}}, DoublePrecision}, "dim"}, + {{"derf", {{"x", DoublePrecision}}, DoublePrecision}, "erf"}, {{"dexp", {{"x", DoublePrecision}}, DoublePrecision}, "exp"}, {{"dfloat", {{"a", AnyInt}}, DoublePrecision}, "real", true}, {{"dim", {{"x", DefaultReal}, {"y", DefaultReal}}, DefaultReal}}, diff --git a/flang/lib/Evaluate/shape.cpp b/flang/lib/Evaluate/shape.cpp index 5cf48b240eca62..c62d0cb0ff29dd 100644 --- a/flang/lib/Evaluate/shape.cpp +++ b/flang/lib/Evaluate/shape.cpp @@ -1037,7 +1037,7 @@ auto GetShapeHelper::operator()(const ProcedureRef &call) const -> Result { } else if (context_) { if (auto moldTypeAndShape{characteristics::TypeAndShape::Characterize( call.arguments().at(1), *context_)}) { - if (GetRank(moldTypeAndShape->shape()) == 0) { + if (moldTypeAndShape->Rank() == 0) { // SIZE= is absent and MOLD= is scalar: result is scalar return ScalarShape(); } else { diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index b2a50ab9b6b833..a5f4faa0cef8fc 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -82,6 +82,8 @@ auto IsVariableHelper::operator()(const Symbol &symbol) const -> Result { const Symbol &ultimate{symbol.GetUltimate()}; return !IsNamedConstant(ultimate) && (ultimate.has() || + (ultimate.has() && + ultimate.attrs().test(semantics::Attr::TARGET)) || ultimate.has()); } auto IsVariableHelper::operator()(const Component &x) const -> Result { diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 404d1f6d39446b..50f58843ec70bd 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -757,10 +757,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { }); } - void copyVar(mlir::Location loc, mlir::Value dst, - mlir::Value src) override final { + void copyVar(mlir::Location loc, mlir::Value dst, mlir::Value src, + fir::FortranVariableFlagsEnum attrs) override final { + bool isAllocatable = + bitEnumContainsAny(attrs, fir::FortranVariableFlagsEnum::allocatable); + bool isPointer = + bitEnumContainsAny(attrs, fir::FortranVariableFlagsEnum::pointer); + copyVarHLFIR(loc, Fortran::lower::SymbolBox::Intrinsic{dst}, - Fortran::lower::SymbolBox::Intrinsic{src}); + Fortran::lower::SymbolBox::Intrinsic{src}, isAllocatable, + isPointer); } 
void copyHostAssociateVar( @@ -1089,6 +1095,28 @@ class FirConverter : public Fortran::lower::AbstractConverter { void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, Fortran::lower::SymbolBox src) { assert(lowerToHighLevelFIR()); + + bool isBoxAllocatable = dst.match( + [](const fir::MutableBoxValue &box) { return box.isAllocatable(); }, + [](const fir::FortranVariableOpInterface &box) { + return fir::FortranVariableOpInterface(box).isAllocatable(); + }, + [](const auto &box) { return false; }); + + bool isBoxPointer = dst.match( + [](const fir::MutableBoxValue &box) { return box.isPointer(); }, + [](const fir::FortranVariableOpInterface &box) { + return fir::FortranVariableOpInterface(box).isPointer(); + }, + [](const auto &box) { return false; }); + + copyVarHLFIR(loc, dst, src, isBoxAllocatable, isBoxPointer); + } + + void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, + Fortran::lower::SymbolBox src, bool isAllocatable, + bool isPointer) { + assert(lowerToHighLevelFIR()); hlfir::Entity lhs{dst.getAddr()}; hlfir::Entity rhs{src.getAddr()}; // Temporary_lhs is set to true in hlfir.assign below to avoid user @@ -1105,21 +1133,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { /*temporary_lhs=*/true); }; - bool isBoxAllocatable = dst.match( - [](const fir::MutableBoxValue &box) { return box.isAllocatable(); }, - [](const fir::FortranVariableOpInterface &box) { - return fir::FortranVariableOpInterface(box).isAllocatable(); - }, - [](const auto &box) { return false; }); - - bool isBoxPointer = dst.match( - [](const fir::MutableBoxValue &box) { return box.isPointer(); }, - [](const fir::FortranVariableOpInterface &box) { - return fir::FortranVariableOpInterface(box).isPointer(); - }, - [](const auto &box) { return false; }); - - if (isBoxAllocatable) { + if (isAllocatable) { // Deep copy allocatable if it is allocated. // Note that when allocated, the RHS is already allocated with the LHS // shape for copy on entry in createHostAssociateVarClone. @@ -1134,7 +1148,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { copyData(lhs, rhs); }) .end(); - } else if (isBoxPointer) { + } else if (isPointer) { // Set LHS target to the target of RHS (do not copy the RHS // target data into the LHS target storage). auto loadVal = builder->create(loc, rhs); diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index 30f985804d2ae7..c0ef96adc20c3e 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -177,9 +177,10 @@ mlir::Location Fortran::lower::CallerInterface::getCalleeLocation() const { // explicit. static Fortran::evaluate::characteristics::DummyDataObject asImplicitArg(Fortran::evaluate::characteristics::DummyDataObject &&dummy) { - Fortran::evaluate::Shape shape = - dummy.type.attrs().none() ? dummy.type.shape() - : Fortran::evaluate::Shape(dummy.type.Rank()); + std::optional shape = + dummy.type.attrs().none() + ? dummy.type.shape() + : std::make_optional(dummy.type.Rank()); return Fortran::evaluate::characteristics::DummyDataObject( Fortran::evaluate::characteristics::TypeAndShape(dummy.type.type(), std::move(shape))); @@ -1308,18 +1309,17 @@ class Fortran::lower::CallInterfaceImpl { // with the shape (may contain unknown extents) for arrays. 
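One scenario where the allocatable/pointer distinction now threaded through copyVar matters is an OpenMP copy of a POINTER versus an ALLOCATABLE: per the lowering above, an allocated allocatable is deep-copied, while for a pointer only the association is propagated. A minimal Fortran sketch of such a case (my own illustration under the assumption that it is reached via the copyprivate path, not a test from the patch; assume the subroutine is called from inside a parallel region so the locals are private):

    subroutine broadcast_state()
      implicit none
      integer, allocatable :: a(:)
      integer, pointer :: p(:)
      integer, target, save :: buf(4)
      !$omp single
      allocate(a(4)); a = 1   ! allocatable: contents are deep-copied to other threads
      p => buf                ! pointer: only the association is copied, not buf's data
      !$omp end single copyprivate(a, p)
    end subroutine broadcast_state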
std::optional getBounds( const Fortran::evaluate::characteristics::TypeAndShape &typeAndShape) { - using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr; - if (typeAndShape.shape().empty() && - !typeAndShape.attrs().test(ShapeAttr::AssumedRank)) + if (typeAndShape.shape() && typeAndShape.shape()->empty()) return std::nullopt; fir::SequenceType::Shape bounds; - for (const std::optional &extent : - typeAndShape.shape()) { - fir::SequenceType::Extent bound = fir::SequenceType::getUnknownExtent(); - if (std::optional i = toInt64(extent)) - bound = *i; - bounds.emplace_back(bound); - } + if (typeAndShape.shape()) + for (const std::optional &extent : + *typeAndShape.shape()) { + fir::SequenceType::Extent bound = fir::SequenceType::getUnknownExtent(); + if (std::optional i = toInt64(extent)) + bound = *i; + bounds.emplace_back(bound); + } return bounds; } std::optional diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h index f0af5f982c14f5..e8d6bb591e3617 100644 --- a/flang/lib/Lower/DirectivesCommon.h +++ b/flang/lib/Lower/DirectivesCommon.h @@ -718,7 +718,6 @@ gatherBoundsOrBoundValues(fir::FirOpBuilder &builder, mlir::Location loc, template llvm::SmallVector genBoundsOpsFromBox(fir::FirOpBuilder &builder, mlir::Location loc, - Fortran::lower::AbstractConverter &converter, fir::ExtendedValue dataExv, Fortran::lower::AddrAndBoundsInfo &info) { llvm::SmallVector bounds; @@ -778,7 +777,6 @@ genBoundsOpsFromBox(fir::FirOpBuilder &builder, mlir::Location loc, template llvm::SmallVector genBaseBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, - Fortran::lower::AbstractConverter &converter, fir::ExtendedValue dataExv, bool isAssumedSize) { mlir::Type idxTy = builder.getIndexType(); mlir::Type boundTy = builder.getType(); @@ -1163,7 +1161,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( info.rawInput = info.addr; if (mlir::isa(fir::unwrapRefType(info.addr.getType()))) bounds = genBaseBoundsOps(builder, operandLocation, - converter, compExv, + compExv, /*isAssumedSize=*/false); asFortran << designator.AsFortran(); @@ -1189,7 +1187,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( info.addr = boxAddrOp.getVal(); info.rawInput = info.addr; bounds = genBoundsOpsFromBox( - builder, operandLocation, converter, compExv, info); + builder, operandLocation, compExv, info); } } else { if (detail::getRef(designator)) { @@ -1206,13 +1204,13 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( if (mlir::isa( fir::unwrapRefType(info.addr.getType()))) { bounds = genBoundsOpsFromBox( - builder, operandLocation, converter, dataExv, info); + builder, operandLocation, dataExv, info); } bool dataExvIsAssumedSize = Fortran::semantics::IsAssumedSizeArray(symRef->get().GetUltimate()); if (mlir::isa(fir::unwrapRefType(info.addr.getType()))) bounds = genBaseBoundsOps( - builder, operandLocation, converter, dataExv, dataExvIsAssumedSize); + builder, operandLocation, dataExv, dataExvIsAssumedSize); asFortran << symRef->get().name().ToString(); } else { // Unsupported llvm::report_fatal_error("Unsupported type of OpenACC operand"); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 27eea2b133b3cf..f78cd0f9df1a18 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -394,6 +394,28 @@ bool ClauseProcessor::processNumThreads( return false; } +bool ClauseProcessor::processOrder(mlir::omp::OrderClauseOps &result) const { + using Order = omp::clause::Order; 
+ if (auto *clause = findUniqueClause()) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + result.orderAttr = mlir::omp::ClauseOrderKindAttr::get( + firOpBuilder.getContext(), mlir::omp::ClauseOrderKind::Concurrent); + const auto &modifier = + std::get>(clause->t); + if (modifier && *modifier == Order::OrderModifier::Unconstrained) { + result.orderModAttr = mlir::omp::OrderModifierAttr::get( + firOpBuilder.getContext(), mlir::omp::OrderModifier::unconstrained); + } else { + // "If order-modifier is not unconstrained, the behavior is as if the + // reproducible modifier is present." + result.orderModAttr = mlir::omp::OrderModifierAttr::get( + firOpBuilder.getContext(), mlir::omp::OrderModifier::reproducible); + } + return true; + } + return false; +} + bool ClauseProcessor::processOrdered( mlir::omp::OrderedClauseOps &result) const { if (auto *clause = findUniqueClause()) { @@ -729,7 +751,7 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter, auto declSrc = builder.create( loc, funcOp.getArgument(1), copyFuncName + "_src", shape, typeparams, /*dummy_scope=*/nullptr, attrs); - converter.copyVar(loc, declDst.getBase(), declSrc.getBase()); + converter.copyVar(loc, declDst.getBase(), declSrc.getBase(), varAttrs); builder.create(loc); return funcOp; } @@ -1005,7 +1027,8 @@ bool ClauseProcessor::processReduction( // Copy local lists into the output. llvm::copy(reductionVars, std::back_inserter(result.reductionVars)); - llvm::copy(reduceVarByRef, std::back_inserter(result.reduceVarByRef)); + llvm::copy(reduceVarByRef, + std::back_inserter(result.reductionVarsByRef)); llvm::copy(reductionDeclSymbols, std::back_inserter(result.reductionDeclSymbols)); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 5c9ab8baf82dd5..53571ae5abc204 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -77,6 +77,7 @@ class ClauseProcessor { mlir::omp::NumTeamsClauseOps &result) const; bool processNumThreads(lower::StatementContext &stmtCtx, mlir::omp::NumThreadsClauseOps &result) const; + bool processOrder(mlir::omp::OrderClauseOps &result) const; bool processOrdered(mlir::omp::OrderedClauseOps &result) const; bool processPriority(lower::StatementContext &stmtCtx, mlir::omp::PriorityClauseOps &result) const; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 7d75e6f67dc1bc..22c41fce31723d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -776,6 +776,33 @@ static void genBodyOfTargetDataOp( } } +// This generates intermediate common block member accesses within a region +// and then rebinds the members symbol to the intermediate accessors we have +// generated so that subsequent code generation will utilise these instead. +// +// When the scope changes, the bindings to the intermediate accessors should +// be dropped in place of the original symbol bindings. +// +// This is for utilisation with TargetOp. 
+static void genIntermediateCommonBlockAccessors( + Fortran::lower::AbstractConverter &converter, + const mlir::Location &currentLocation, mlir::Region &region, + llvm::ArrayRef mapSyms) { + for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { + if (auto *details = + argSymbol->detailsIf()) { + for (auto obj : details->objects()) { + auto targetCBMemberBind = Fortran::lower::genCommonBlockMember( + converter, currentLocation, *obj, region.getArgument(argIndex)); + fir::ExtendedValue sexv = converter.getSymbolExtendedValue(*obj); + fir::ExtendedValue targetCBExv = + getExtendedValue(sexv, targetCBMemberBind); + converter.bindSymbol(*obj, targetCBExv); + } + } + } +} + // This function creates a block for the body of the targetOp's region. It adds // all the symbols present in mapSymbols as block arguments to this block. static void @@ -955,6 +982,16 @@ genBodyOfTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, // Create the insertion point after the marker. firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); + // If we map a common block using its symbol e.g. map(tofrom: /common_block/) + // and access its members within the target region, there is a large + // chance we will end up with uses external to the region accessing the common + // block. To resolve these, we generate new common block member accesses + // within the region, binding them to the member symbol for the scope of the + // region so that subsequent code generation within the region will utilise + // the new member accesses we have created. + genIntermediateCommonBlockAccessors(converter, currentLocation, region, + mapSyms); + if (ConstructQueue::iterator next = std::next(item); next != queue.end()) { genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue, next); @@ -1000,6 +1037,7 @@ static void genDistributeClauses(lower::AbstractConverter &converter, ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processDistSchedule(stmtCtx, clauseOps); + cp.processOrder(clauseOps); // TODO Support delayed privatization. } @@ -1072,13 +1110,14 @@ static void genSimdClauses(lower::AbstractConverter &converter, ClauseProcessor cp(converter, semaCtx, clauses); cp.processAligned(clauseOps); cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); + cp.processOrder(clauseOps); cp.processReduction(loc, clauseOps); cp.processSafelen(clauseOps); cp.processSimdlen(clauseOps); // TODO Support delayed privatization. - cp.processTODO(loc, llvm::omp::Directive::OMPD_simd); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_simd); } static void genSingleClauses(lower::AbstractConverter &converter, @@ -1243,12 +1282,13 @@ static void genWsloopClauses( llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processNowait(clauseOps); + cp.processOrder(clauseOps); cp.processOrdered(clauseOps); cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); cp.processSchedule(stmtCtx, clauseOps); // TODO Support delayed privatization. - cp.processTODO( + cp.processTODO( loc, llvm::omp::Directive::OMPD_do); } @@ -1670,6 +1710,13 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, if (dsp.getAllSymbolsToPrivatize().contains(&sym)) return; + // if the symbol is part of an already mapped common block, do not make a + // map for it.
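As a rough illustration of the common-block mapping scenario described above (a hypothetical example, not from the patch or its tests): when the whole common block is mapped by name, member references inside the target region are rebound to accessors generated from the region's block argument, and the members themselves do not get separate map entries.

```fortran
program map_common_block
  implicit none
  integer :: ia, ib
  common /vars/ ia, ib
  ia = 1
  ib = 2
  !$omp target map(tofrom: /vars/)
    ! ia and ib are accessed through the mapped common block, not
    ! through their host symbols.
    ia = ia + 10
    ib = ib + ia
  !$omp end target
  print *, ia, ib
end program map_common_block
```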
+ if (const Fortran::semantics::Symbol *common = + Fortran::semantics::FindCommonBlockContaining(sym.GetUltimate())) + if (llvm::find(mapSyms, common) != mapSyms.end()) + return; + if (llvm::find(mapSyms, &sym) == mapSyms.end()) { mlir::Value baseOp = converter.getSymbolAddress(sym); if (!baseOp) @@ -1691,15 +1738,14 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, fir::unwrapRefType(info.addr.getType()))) bounds = lower::genBoundsOpsFromBox( - firOpBuilder, converter.getCurrentLocation(), converter, dataExv, - info); + firOpBuilder, converter.getCurrentLocation(), dataExv, info); if (mlir::isa( fir::unwrapRefType(info.addr.getType()))) { bool dataExvIsAssumedSize = semantics::IsAssumedSizeArray(sym.GetUltimate()); bounds = lower::genBaseBoundsOps( - firOpBuilder, converter.getCurrentLocation(), converter, dataExv, + firOpBuilder, converter.getCurrentLocation(), dataExv, dataExvIsAssumedSize); } @@ -1979,8 +2025,8 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, ConstructQueue::iterator item) { ClauseProcessor cp(converter, semaCtx, item->clauses); cp.processTODO( - loc, llvm::omp::OMPD_do_simd); + clause::Safelen, clause::Simdlen>(loc, + llvm::omp::OMPD_do_simd); // TODO: Add support for vectorization - add vectorization hints inside loop // body. // OpenMP standard does not specify the length of vector instructions. diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 149afdf601c936..94d94398d696ad 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_flang_library(FIRTransforms AnnotateConstant.cpp AssumedRankOpConversion.cpp CharacterConversion.cpp + ConstantArgumentGlobalisation.cpp ControlFlowConverter.cpp ArrayValueCopy.cpp ExternalNameConversion.cpp diff --git a/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp b/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp new file mode 100644 index 00000000000000..f7074a79a8a8d7 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp @@ -0,0 +1,185 @@ +//===- ConstantArgumentGlobalisation.cpp ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Dominance.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace fir { +#define GEN_PASS_DEF_CONSTANTARGUMENTGLOBALISATIONOPT +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +#define DEBUG_TYPE "flang-constant-argument-globalisation-opt" + +namespace { +unsigned uniqueLitId = 1; + +class CallOpRewriter : public mlir::OpRewritePattern { +protected: + const mlir::DominanceInfo &di; + +public: + using OpRewritePattern::OpRewritePattern; + + CallOpRewriter(mlir::MLIRContext *ctx, const mlir::DominanceInfo &_di) + : OpRewritePattern(ctx), di(_di) {} + + mlir::LogicalResult + matchAndRewrite(fir::CallOp callOp, + mlir::PatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "Processing call op: " << callOp << "\n"); + auto module = callOp->getParentOfType(); + bool needUpdate = false; + fir::FirOpBuilder builder(rewriter, module); + llvm::SmallVector newOperands; + llvm::SmallVector> allocas; + for (const mlir::Value &a : callOp.getArgs()) { + auto alloca = mlir::dyn_cast_or_null(a.getDefiningOp()); + // We can convert arguments that are alloca, and that has + // the value by reference attribute. All else is just added + // to the argument list. + if (!alloca || !alloca->hasAttr(fir::getAdaptToByRefAttrName())) { + newOperands.push_back(a); + continue; + } + + mlir::Type varTy = alloca.getInType(); + assert(!fir::hasDynamicSize(varTy) && + "only expect statically sized scalars to be by value"); + + // Find immediate store with const argument + mlir::Operation *store = nullptr; + for (mlir::Operation *s : alloca->getUsers()) { + if (mlir::isa(s) && di.dominates(s, callOp)) { + // We can only deal with ONE store - if already found one, + // set to nullptr and exit the loop. + if (store) { + store = nullptr; + break; + } + store = s; + } + } + + // If we didn't find any store, or multiple stores, add argument as is + // and move on. + if (!store) { + newOperands.push_back(a); + continue; + } + + LLVM_DEBUG(llvm::dbgs() << " found store " << *store << "\n"); + + mlir::Operation *definingOp = store->getOperand(0).getDefiningOp(); + // If not a constant, add to operands and move on. + if (!mlir::isa(definingOp)) { + // Unable to remove alloca arg + newOperands.push_back(a); + continue; + } + + LLVM_DEBUG(llvm::dbgs() << " found define " << *definingOp << "\n"); + + std::string globalName = + "_global_const_." 
+ std::to_string(uniqueLitId++); + assert(!builder.getNamedGlobal(globalName) && + "We should have a unique name here"); + + if (std::find_if(allocas.begin(), allocas.end(), [alloca](auto x) { + return x.first == alloca; + }) == allocas.end()) { + allocas.push_back(std::make_pair(alloca, store)); + } + + auto loc = callOp.getLoc(); + fir::GlobalOp global = builder.createGlobalConstant( + loc, varTy, globalName, + [&](fir::FirOpBuilder &builder) { + mlir::Operation *cln = definingOp->clone(); + builder.insert(cln); + mlir::Value val = + builder.createConvert(loc, varTy, cln->getResult(0)); + builder.create(loc, val); + }, + builder.createInternalLinkage()); + mlir::Value addr = builder.create(loc, global.resultType(), + global.getSymbol()); + newOperands.push_back(addr); + needUpdate = true; + } + + if (needUpdate) { + auto loc = callOp.getLoc(); + llvm::SmallVector newResultTypes; + newResultTypes.append(callOp.getResultTypes().begin(), + callOp.getResultTypes().end()); + fir::CallOp newOp = builder.create( + loc, newResultTypes, + callOp.getCallee().has_value() ? callOp.getCallee().value() + : mlir::SymbolRefAttr{}, + newOperands); + // Copy all the attributes from the old to new op. + newOp->setAttrs(callOp->getAttrs()); + rewriter.replaceOp(callOp, newOp); + + for (auto a : allocas) { + if (a.first->hasOneUse()) { + // If the alloca is only used for a store and the call operand, the + // store is no longer required. + rewriter.eraseOp(a.second); + rewriter.eraseOp(a.first); + } + } + LLVM_DEBUG(llvm::dbgs() << "global constant for " << callOp << " as " + << newOp << '\n'); + return mlir::success(); + } + + // Failure here just means "we couldn't do the conversion", which is + // perfectly acceptable to the upper layers of this function. + return mlir::failure(); + } +}; + +// this pass attempts to convert immediate scalar literals in function calls +// to global constants to allow transformations such as Dead Argument +// Elimination +class ConstantArgumentGlobalisationOpt + : public fir::impl::ConstantArgumentGlobalisationOptBase< + ConstantArgumentGlobalisationOpt> { +public: + ConstantArgumentGlobalisationOpt() = default; + + void runOnOperation() override { + mlir::ModuleOp mod = getOperation(); + mlir::DominanceInfo *di = &getAnalysis(); + auto *context = &getContext(); + mlir::RewritePatternSet patterns(context); + mlir::GreedyRewriteConfig config; + config.enableRegionSimplification = + mlir::GreedySimplifyRegionLevel::Disabled; + config.strictMode = mlir::GreedyRewriteStrictness::ExistingOps; + + patterns.insert(context, *di); + if (mlir::failed(mlir::applyPatternsAndFoldGreedily( + mod, std::move(patterns), config))) { + mlir::emitError(mod.getLoc(), + "error in constant globalisation optimization\n"); + signalPassFailure(); + } + } +}; +} // namespace diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 407ecc8e327b48..71cfee6d735aea 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -61,9 +61,11 @@ DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m) // descriptors like lower_bound and extent for each dimension. 
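To make the intent of the ConstantArgumentGlobalisation pass added above more concrete, here is a hypothetical Fortran caller of the kind it targets (names are illustrative): literal actual arguments are materialised into by-reference temporaries that hold a single constant store, which the pass can replace with internal read-only globals, helping later dead-argument elimination.

```fortran
module m
contains
  subroutine callee(a, b)
    integer, intent(in) :: a
    real, intent(in) :: b
    print *, a, b
  end subroutine callee
end module m

program p
  use m
  ! 42 and 1.5 are passed by reference via compiler temporaries that each
  ! receive one constant store; those temporaries are candidates for
  ! promotion to internal global constants.
  call callee(42, 1.5)
end program p
```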
mlir::Type llvmDimsType = getDescFieldTypeModel()(context); mlir::Type llvmPtrType = getDescFieldTypeModel()(context); + mlir::Type llvmLenType = getDescFieldTypeModel()(context); dimsOffset = getComponentOffset(*dl, context, llvmDimsType); dimsSize = dl->getTypeSize(llvmDimsType); ptrSize = dl->getTypeSize(llvmPtrType); + lenOffset = getComponentOffset(*dl, context, llvmLenType); } static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context, @@ -192,10 +194,8 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( mlir::LLVM::DITypeAttr DebugTypeGenerator::convertCharacterType( fir::CharacterType charTy, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope, mlir::Location loc) { + mlir::LLVM::DIScopeAttr scope, mlir::Location loc, bool hasDescriptor) { mlir::MLIRContext *context = module.getContext(); - if (!charTy.hasConstantLen()) - return genPlaceholderType(context); // DWARF 5 says the following about the character encoding in 5.1.1.2. // "DW_ATE_ASCII and DW_ATE_UCS specify encodings for the Fortran 2003 @@ -205,16 +205,38 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertCharacterType( if (charTy.getFKind() != 1) encoding = llvm::dwarf::DW_ATE_UCS; + uint64_t sizeInBits = 0; + mlir::LLVM::DIExpressionAttr lenExpr = nullptr; + mlir::LLVM::DIExpressionAttr locExpr = nullptr; + + if (hasDescriptor) { + llvm::SmallVector ops; + auto addOp = [&](unsigned opc, llvm::ArrayRef vals) { + ops.push_back(mlir::LLVM::DIExpressionElemAttr::get(context, opc, vals)); + }; + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_plus_uconst, {lenOffset}); + lenExpr = mlir::LLVM::DIExpressionAttr::get(context, ops); + ops.clear(); + + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_deref, {}); + locExpr = mlir::LLVM::DIExpressionAttr::get(context, ops); + } else if (charTy.hasConstantLen()) { + sizeInBits = + charTy.getLen() * kindMapping.getCharacterBitsize(charTy.getFKind()); + } else { + return genPlaceholderType(context); + } + // FIXME: Currently the DIStringType in llvm does not have the option to set // type of the underlying character. This restricts out ability to represent // string with non-default characters. Please see issue #95440 for more // details. 
return mlir::LLVM::DIStringTypeAttr::get( context, llvm::dwarf::DW_TAG_string_type, - mlir::StringAttr::get(context, ""), - charTy.getLen() * kindMapping.getCharacterBitsize(charTy.getFKind()), - /*alignInBits=*/0, /*stringLength=*/nullptr, - /*stringLengthExp=*/nullptr, /*stringLocationExp=*/nullptr, encoding); + mlir::StringAttr::get(context, ""), sizeInBits, /*alignInBits=*/0, + /*stringLength=*/nullptr, lenExpr, locExpr, encoding); } mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( @@ -229,6 +251,9 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( if (auto seqTy = mlir::dyn_cast_or_null(elTy)) return convertBoxedSequenceType(seqTy, fileAttr, scope, loc, genAllocated, genAssociated); + if (auto charTy = mlir::dyn_cast_or_null(elTy)) + return convertCharacterType(charTy, fileAttr, scope, loc, + /*hasDescriptor=*/true); mlir::LLVM::DITypeAttr elTyAttr = convertType(elTy, fileAttr, scope, loc); @@ -274,7 +299,8 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, } else if (auto seqTy = mlir::dyn_cast_or_null(Ty)) { return convertSequenceType(seqTy, fileAttr, scope, loc); } else if (auto charTy = mlir::dyn_cast_or_null(Ty)) { - return convertCharacterType(charTy, fileAttr, scope, loc); + return convertCharacterType(charTy, fileAttr, scope, loc, + /*hasDescriptor=*/false); } else if (auto boxTy = mlir::dyn_cast_or_null(Ty)) { auto elTy = boxTy.getElementType(); if (auto seqTy = mlir::dyn_cast_or_null(elTy)) @@ -284,6 +310,10 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, return convertPointerLikeType(heapTy.getElementType(), fileAttr, scope, loc, /*genAllocated=*/true, /*genAssociated=*/false); + if (auto ptrTy = mlir::dyn_cast_or_null(elTy)) + return convertPointerLikeType(ptrTy.getElementType(), fileAttr, scope, + loc, /*genAllocated=*/false, + /*genAssociated=*/true); return genPlaceholderType(context); } else { // FIXME: These types are currently unhandled. 
We are generating a diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h index 7816363e988214..ec881e8be7cadc 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h @@ -48,7 +48,8 @@ class DebugTypeGenerator { mlir::LLVM::DITypeAttr convertCharacterType(fir::CharacterType charTy, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, - mlir::Location loc); + mlir::Location loc, + bool hasDescriptor); mlir::LLVM::DITypeAttr convertPointerLikeType(mlir::Type elTy, mlir::LLVM::DIFileAttr fileAttr, @@ -60,6 +61,7 @@ class DebugTypeGenerator { std::uint64_t dimsSize; std::uint64_t dimsOffset; std::uint64_t ptrSize; + std::uint64_t lenOffset; }; } // namespace fir diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 2a6ecfbb0830ec..ca6a8239992441 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -105,13 +105,15 @@ void Prescanner::Statement() { NextLine(); return; case LineClassification::Kind::ConditionalCompilationDirective: - case LineClassification::Kind::DefinitionDirective: - case LineClassification::Kind::PreprocessorDirective: + case LineClassification::Kind::IncludeDirective: preprocessor_.Directive(TokenizePreprocessorDirective(), *this); + afterPreprocessingDirective_ = true; + skipLeadingAmpersand_ |= !inFixedForm_; return; - case LineClassification::Kind::IncludeDirective: + case LineClassification::Kind::PreprocessorDirective: + case LineClassification::Kind::DefinitionDirective: preprocessor_.Directive(TokenizePreprocessorDirective(), *this); - afterIncludeDirective_ = true; + // Don't set afterPreprocessingDirective_ return; case LineClassification::Kind::CompilerDirective: { directiveSentinel_ = line.sentinel; @@ -171,15 +173,17 @@ void Prescanner::Statement() { NextChar(); } LabelField(tokens); - } else if (skipLeadingAmpersand_) { - skipLeadingAmpersand_ = false; - const char *p{SkipWhiteSpace(at_)}; - if (p < limit_ && *p == '&') { - column_ += ++p - at_; - at_ = p; - } } else { - SkipSpaces(); + if (skipLeadingAmpersand_) { + skipLeadingAmpersand_ = false; + const char *p{SkipWhiteSpace(at_)}; + if (p < limit_ && *p == '&') { + column_ += ++p - at_; + at_ = p; + } + } else { + SkipSpaces(); + } // Check for a leading identifier that might be a keyword macro // that will expand to anything indicating a non-source line, like // a comment marker or directive sentinel. If so, disable line @@ -289,13 +293,14 @@ void Prescanner::CheckAndEmitLine( tokens.CheckBadFortranCharacters( messages_, *this, disableSourceContinuation_); // Parenthesis nesting check does not apply while any #include is - // active, nor on the lines before and after a top-level #include. + // active, nor on the lines before and after a top-level #include, + // nor before or after conditional source. // Applications play shenanigans with line continuation before and - // after #include'd subprogram argument lists. + // after #include'd subprogram argument lists and conditional source. 
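A hypothetical sketch of the "shenanigans" the relaxed parenthesis-nesting check accommodates: a single call statement whose argument list is split by continuation lines around conditionally compiled source, so that no individual line has balanced parentheses.

```fortran
subroutine s(x)
  real, intent(inout) :: x
  ! The opening and closing parentheses of this call end up on lines
  ! separated by a preprocessing directive.
  call process(x, &
#ifdef EXTRA_ARG
               1.0, &
#endif
               2.0)
end subroutine s
```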
if (!isNestedInIncludeDirective_ && !omitNewline_ && - !afterIncludeDirective_ && tokens.BadlyNestedParentheses()) { - if (inFixedForm_ && nextLine_ < limit_ && - IsPreprocessorDirectiveLine(nextLine_)) { + !afterPreprocessingDirective_ && tokens.BadlyNestedParentheses() && + !preprocessor_.InConditional()) { + if (nextLine_ < limit_ && IsPreprocessorDirectiveLine(nextLine_)) { // don't complain } else { tokens.CheckBadParentheses(messages_); @@ -306,7 +311,7 @@ void Prescanner::CheckAndEmitLine( omitNewline_ = false; } else { cooked_.Put('\n', newlineProvenance); - afterIncludeDirective_ = false; + afterPreprocessingDirective_ = false; } } @@ -353,10 +358,11 @@ void Prescanner::LabelField(TokenSequence &token) { ++column_; } if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) { - if (prescannerNesting_ > 0 && *badColumn == 6 && - cooked_.BufferedBytes() == firstCookedCharacterOffset_) { - // This is the first source line in #included text or conditional - // code under #if. + if ((prescannerNesting_ > 0 && *badColumn == 6 && + cooked_.BufferedBytes() == firstCookedCharacterOffset_) || + afterPreprocessingDirective_) { + // This is the first source line in #include'd text or conditional + // code under #if, or the first source line after such. // If it turns out that the preprocessed text begins with a // fixed form continuation line, the newline at the end // of the latest source line beforehand will be deleted in @@ -465,7 +471,7 @@ bool Prescanner::MustSkipToEndOfLine() const { if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) { return true; // skip over ignored columns in right margin (73:80) } else if (*at_ == '!' && !inCharLiteral_) { - return true; // inline comment goes to end of source line + return !IsCompilerDirectiveSentinel(at_); } else { return false; } @@ -599,7 +605,7 @@ bool Prescanner::NextToken(TokenSequence &tokens) { char previous{at_ <= start_ ? ' ' : at_[-1]}; NextChar(); SkipSpaces(); - if (*at_ == '\n') { + if (*at_ == '\n' && !omitNewline_) { // Discard white space at the end of a line. } else if (!inPreprocessorDirective_ && (previous == '(' || *at_ == '(' || *at_ == ')')) { @@ -814,6 +820,41 @@ void Prescanner::QuotedCharacterLiteral( } break; } + // Here's a weird edge case. When there's a two or more following + // continuation lines at this point, and the entire significant part of + // the next continuation line is the name of a keyword macro, replace + // it in the character literal with its definition. Example: + // #define FOO foo + // subroutine subr() bind(c, name="my_& + // &FOO& + // &_bar") ... + // produces a binding name of "my_foo_bar". 
+ while (at_[1] == '&' && nextLine_ < limit_ && !InFixedFormSource()) { + const char *idStart{nextLine_}; + if (const char *amper{SkipWhiteSpace(nextLine_)}; *amper == '&') { + idStart = amper + 1; + } + if (IsLegalIdentifierStart(*idStart)) { + std::size_t idLen{1}; + for (; IsLegalInIdentifier(idStart[idLen]); ++idLen) { + } + if (idStart[idLen] == '&') { + CharBlock id{idStart, idLen}; + if (preprocessor_.IsNameDefined(id)) { + TokenSequence ppTokens; + ppTokens.Put(id, GetProvenance(idStart)); + if (auto replaced{ + preprocessor_.MacroReplacement(ppTokens, *this)}) { + tokens.Put(*replaced); + at_ = &idStart[idLen - 1]; + NextLine(); + continue; // try again on the next line + } + } + } + } + break; + } end = at_ + 1; NextChar(); if (*at_ == quote && !isEscaped) { @@ -1069,6 +1110,17 @@ bool Prescanner::SkipCommentLine(bool afterAmpersand) { return true; } else if (inPreprocessorDirective_) { return false; + } else if (afterAmpersand && + (lineClass.kind == + LineClassification::Kind::ConditionalCompilationDirective || + lineClass.kind == LineClassification::Kind::DefinitionDirective || + lineClass.kind == LineClassification::Kind::PreprocessorDirective || + lineClass.kind == LineClassification::Kind::IncludeDirective || + lineClass.kind == LineClassification::Kind::IncludeLine)) { + SkipToEndOfLine(); + omitNewline_ = true; + skipLeadingAmpersand_ = true; + return false; } else if (lineClass.kind == LineClassification::Kind::ConditionalCompilationDirective || lineClass.kind == LineClassification::Kind::PreprocessorDirective) { @@ -1080,13 +1132,6 @@ bool Prescanner::SkipCommentLine(bool afterAmpersand) { // continued line). preprocessor_.Directive(TokenizePreprocessorDirective(), *this); return true; - } else if (afterAmpersand && - (lineClass.kind == LineClassification::Kind::IncludeDirective || - lineClass.kind == LineClassification::Kind::IncludeLine)) { - SkipToEndOfLine(); - omitNewline_ = true; - skipLeadingAmpersand_ = true; - return false; } else { return false; } @@ -1335,32 +1380,12 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { std::optional Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const { - char sentinel[8]; - const char *p{SkipWhiteSpace(start)}; - if (*p++ != '!') { - return std::nullopt; - } - for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) { - if (*p == '\n') { - break; - } - if (*p == ' ' || *p == '\t' || *p == '&') { - if (j == 0) { - break; - } - sentinel[j] = '\0'; - p = SkipWhiteSpace(p + 1); - if (*p == '!') { - break; - } - if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) { - std::size_t offset = p - start; - return {LineClassification{ - LineClassification::Kind::CompilerDirective, offset, sp}}; - } - break; + if (const char *p{SkipWhiteSpace(start)}; p && *p++ == '!') { + if (auto maybePair{IsCompilerDirectiveSentinel(p)}) { + auto offset{static_cast(maybePair->second - start)}; + return {LineClassification{LineClassification::Kind::CompilerDirective, + offset, maybePair->first}}; } - sentinel[j] = ToLowerCaseLetter(*p); } return std::nullopt; } @@ -1405,6 +1430,28 @@ const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const { return end > p && IsCompilerDirectiveSentinel(p, end - p) ? 
p : nullptr; } +std::optional> +Prescanner::IsCompilerDirectiveSentinel(const char *p) const { + char sentinel[8]; + for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) { + if (*p == ' ' || *p == '\t' || *p == '&') { + if (j > 0) { + sentinel[j] = '\0'; + p = SkipWhiteSpace(p + 1); + if (*p != '!') { + if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) { + return std::make_pair(sp, p); + } + } + } + break; + } else { + sentinel[j] = ToLowerCaseLetter(*p); + } + } + return std::nullopt; +} + constexpr bool IsDirective(const char *match, const char *dir) { for (; *match; ++match) { if (*match != ToLowerCaseLetter(*dir++)) { diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index b6f6d2ca439ee7..a64df5377e7e03 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -74,6 +74,9 @@ class Prescanner { const char *IsCompilerDirectiveSentinel(const char *, std::size_t) const; const char *IsCompilerDirectiveSentinel(CharBlock) const; + // 'first' is the sentinel, 'second' is beginning of payload + std::optional> + IsCompilerDirectiveSentinel(const char *p) const; template Message &Say(A &&...a) { return messages_.Say(std::forward(a)...); @@ -214,7 +217,7 @@ class Prescanner { int prescannerNesting_{0}; int continuationLines_{0}; bool isPossibleMacroCall_{false}; - bool afterIncludeDirective_{false}; + bool afterPreprocessingDirective_{false}; bool disableSourceContinuation_{false}; Provenance startProvenance_; diff --git a/flang/lib/Parser/token-sequence.cpp b/flang/lib/Parser/token-sequence.cpp index 133e60ba4f0098..c73bca1df33de3 100644 --- a/flang/lib/Parser/token-sequence.cpp +++ b/flang/lib/Parser/token-sequence.cpp @@ -266,7 +266,7 @@ TokenSequence &TokenSequence::ClipComment( if (std::size_t blanks{tok.CountLeadingBlanks()}; blanks < tok.size() && tok[blanks] == '!') { // Retain active compiler directive sentinels (e.g. 
"!dir$") - for (std::size_t k{j + 1}; k < tokens && tok.size() < blanks + 5; ++k) { + for (std::size_t k{j + 1}; k < tokens && tok.size() <= blanks + 5; ++k) { if (tok.begin() + tok.size() == TokenAt(k).begin()) { tok.ExtendToCover(TokenAt(k)); } else { @@ -274,12 +274,9 @@ TokenSequence &TokenSequence::ClipComment( } } bool isSentinel{false}; - if (tok.size() == blanks + 5) { - char sentinel[4]; - for (int k{0}; k < 4; ++k) { - sentinel[k] = ToLowerCaseLetter(tok[blanks + k + 1]); - } - isSentinel = prescanner.IsCompilerDirectiveSentinel(sentinel, 4); + if (tok.size() > blanks + 5) { + isSentinel = prescanner.IsCompilerDirectiveSentinel(&tok[blanks + 1]) + .has_value(); } if (isSentinel) { } else if (skipFirst) { diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index b4c5660670579d..a4fa72b03ca18f 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -539,7 +539,7 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { // Shape related checks if (ultimate_ && evaluate::IsAssumedRank(*ultimate_)) { context.Say(name_.source, - "An assumed-rank object may not appear in an ALLOCATE statement"_err_en_US); + "An assumed-rank dummy argument may not appear in an ALLOCATE statement"_err_en_US); return false; } if (ultimate_ && IsAssumedSizeArray(*ultimate_) && context.AnyFatalError()) { diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index 72576942e62cb3..8fe90eedc913f9 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -67,11 +67,9 @@ static void CheckImplicitInterfaceArg(evaluate::ActualArgument &arg, messages.Say( "Coarray argument requires an explicit interface"_err_en_US); } - if (const auto *details{symbol.detailsIf()}) { - if (details->IsAssumedRank()) { - messages.Say( - "Assumed rank argument requires an explicit interface"_err_en_US); - } + if (evaluate::IsAssumedRank(symbol)) { + messages.Say( + "Assumed rank argument requires an explicit interface"_err_en_US); } if (symbol.attrs().test(Attr::ASYNCHRONOUS)) { messages.Say( @@ -143,8 +141,8 @@ static void CheckCharacterActual(evaluate::Expr &actual, bool canAssociate{CanAssociateWithStorageSequence(dummy)}; if (dummy.type.Rank() > 0 && canAssociate) { // Character storage sequence association (F'2023 15.5.2.12p4) - if (auto dummySize{evaluate::ToInt64(evaluate::Fold(foldingContext, - evaluate::GetSize(evaluate::Shape{dummy.type.shape()})))}) { + if (auto dummySize{evaluate::ToInt64(evaluate::Fold( + foldingContext, evaluate::GetSize(dummy.type.shape())))}) { auto dummyChars{*dummySize * *dummyLength}; if (actualType.Rank() == 0) { evaluate::DesignatorFolder folder{ @@ -183,8 +181,7 @@ static void CheckCharacterActual(evaluate::Expr &actual, } } else { // actual.type.Rank() > 0 if (auto actualSize{evaluate::ToInt64(evaluate::Fold( - foldingContext, - evaluate::GetSize(evaluate::Shape(actualType.shape()))))}; + foldingContext, evaluate::GetSize(actualType.shape())))}; actualSize && *actualSize * *actualLength < *dummySize * *dummyLength && (extentErrors || @@ -251,7 +248,7 @@ static void ConvertIntegerActual(evaluate::Expr &actual, if (dummyType.type().category() == TypeCategory::Integer && actualType.type().category() == TypeCategory::Integer && dummyType.type().kind() != actualType.type().kind() && - GetRank(dummyType.shape()) == 0 && GetRank(actualType.shape()) == 0 && + dummyType.Rank() == 0 && actualType.Rank() == 0 && !evaluate::IsVariable(actual)) { 
auto converted{ evaluate::ConvertToType(dummyType.type(), std::move(actual))}; @@ -387,10 +384,10 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, // if the actual argument is an array or array element designator, // and the dummy is an array, but not assumed-shape or an INTENT(IN) // pointer that's standing in for an assumed-shape dummy. - } else { + } else if (dummy.type.shape() && actualType.shape()) { // Let CheckConformance accept actual scalars; storage association // cases are checked here below. - CheckConformance(messages, dummy.type.shape(), actualType.shape(), + CheckConformance(messages, *dummy.type.shape(), *actualType.shape(), dummyIsAllocatableOrPointer ? evaluate::CheckConformanceFlags::None : evaluate::CheckConformanceFlags::RightScalarExpandable, @@ -579,8 +576,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, CanAssociateWithStorageSequence(dummy) && !dummy.attrs.test( characteristics::DummyDataObject::Attr::DeducedFromActual)) { - if (auto dummySize{evaluate::ToInt64(evaluate::Fold(foldingContext, - evaluate::GetSize(evaluate::Shape{dummy.type.shape()})))}) { + if (auto dummySize{evaluate::ToInt64(evaluate::Fold( + foldingContext, evaluate::GetSize(dummy.type.shape())))}) { if (actualRank == 0 && !actualIsAssumedRank) { if (evaluate::IsArrayElement(actual)) { // Actual argument is a scalar array element @@ -622,8 +619,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, } } } else { // actualRank > 0 || actualIsAssumedRank - if (auto actualSize{evaluate::ToInt64(evaluate::Fold(foldingContext, - evaluate::GetSize(evaluate::Shape(actualType.shape()))))}; + if (auto actualSize{evaluate::ToInt64(evaluate::Fold( + foldingContext, evaluate::GetSize(actualType.shape())))}; actualSize && *actualSize < *dummySize && (extentErrors || context.ShouldWarn(common::UsageWarning::ShortArrayActual))) { diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 4bb625bfbc2cae..dae40502792004 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -252,8 +252,7 @@ void CheckHelper::Check(const Symbol &symbol) { &symbol == &symbol.GetUltimate()) { if (context_.ShouldWarn(common::LanguageFeature::LongNames)) { WarnIfNotInModuleFile(symbol.name(), - "%s has length %d, which is greater than the maximum name length " - "%d"_port_en_US, + "%s has length %d, which is greater than the maximum name length %d"_port_en_US, symbol.name(), symbol.name().size(), common::maxNameLen); } } @@ -354,7 +353,10 @@ void CheckHelper::Check(const Symbol &symbol) { messages_.Say( "A pure subprogram may not have a variable with the VOLATILE attribute"_err_en_US); } - if (IsProcedure(symbol) && !IsPureProcedure(symbol) && IsDummy(symbol)) { + if (innermostSymbol_ && innermostSymbol_->name() == "__builtin_c_funloc") { + // The intrinsic procedure C_FUNLOC() gets a pass on this check. 
+ } else if (IsProcedure(symbol) && !IsPureProcedure(symbol) && + IsDummy(symbol)) { messages_.Say( "A dummy procedure of a pure subprogram must be pure"_err_en_US); } @@ -463,12 +465,12 @@ void CheckHelper::Check(const Symbol &symbol) { symbol.name()); } } - if (IsProcedure(symbol) && !symbol.HasExplicitInterface()) { + if (IsProcedure(symbol)) { if (IsAllocatable(symbol)) { messages_.Say( - "Procedure '%s' may not be ALLOCATABLE without an explicit interface"_err_en_US, - symbol.name()); - } else if (symbol.Rank() > 0) { + "Procedure '%s' may not be ALLOCATABLE"_err_en_US, symbol.name()); + } + if (!symbol.HasExplicitInterface() && symbol.Rank() > 0) { messages_.Say( "Procedure '%s' may not be an array without an explicit interface"_err_en_US, symbol.name()); @@ -1325,6 +1327,13 @@ class SubprogramMatchHelper { bool CheckSameAttrs(const Symbol &, const Symbol &, ATTRS, ATTRS); bool ShapesAreCompatible(const DummyDataObject &, const DummyDataObject &); evaluate::Shape FoldShape(const evaluate::Shape &); + std::optional FoldShape( + const std::optional &shape) { + if (shape) { + return FoldShape(*shape); + } + return std::nullopt; + } std::string AsFortran(DummyDataObject::Attr attr) { return parser::ToUpperCaseLetters(DummyDataObject::EnumToString(attr)); } diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 803c655b3174fe..2202639a92e43d 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -141,7 +141,7 @@ class ArgumentAnalyzer { } void Analyze(const parser::Variable &); void Analyze(const parser::ActualArgSpec &, bool isSubroutine); - void ConvertBOZ(std::optional &thisType, std::size_t i, + void ConvertBOZ(std::optional *thisType, std::size_t, std::optional otherType); bool IsIntrinsicRelational( @@ -153,6 +153,7 @@ class ArgumentAnalyzer { bool CheckConformance(); bool CheckAssignmentConformance(); bool CheckForNullPointer(const char *where = "as an operand here"); + bool CheckForAssumedRank(const char *where = "as an operand here"); // Find and return a user-defined operator or report an error. // The provided message is used if there is no such operator. 
@@ -3200,6 +3201,7 @@ const Assignment *ExpressionAnalyzer::Analyze(const parser::AssignmentStmt &x) { if (!procRef) { analyzer.CheckForNullPointer( "in a non-pointer intrinsic assignment statement"); + analyzer.CheckForAssumedRank("in an assignment statement"); const Expr &lhs{analyzer.GetExpr(0)}; if (auto dyType{lhs.GetType()}; dyType && dyType->IsPolymorphic()) { // 10.2.1.2p1(1) @@ -3394,6 +3396,7 @@ static MaybeExpr NumericUnaryHelper(ExpressionAnalyzer &context, if (!analyzer.fatalErrors()) { if (analyzer.IsIntrinsicNumeric(opr)) { analyzer.CheckForNullPointer(); + analyzer.CheckForAssumedRank(); if (opr == NumericOperator::Add) { return analyzer.MoveExpr(0); } else { @@ -3428,6 +3431,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::Expr::NOT &x) { if (!analyzer.fatalErrors()) { if (analyzer.IsIntrinsicLogical()) { analyzer.CheckForNullPointer(); + analyzer.CheckForAssumedRank(); return AsGenericExpr( LogicalNegation(std::get>(analyzer.MoveExpr(0).u))); } else { @@ -3476,6 +3480,7 @@ MaybeExpr NumericBinaryHelper(ExpressionAnalyzer &context, NumericOperator opr, if (!analyzer.fatalErrors()) { if (analyzer.IsIntrinsicNumeric(opr)) { analyzer.CheckForNullPointer(); + analyzer.CheckForAssumedRank(); analyzer.CheckConformance(); return NumericOperation(context.GetContextualMessages(), analyzer.MoveExpr(0), analyzer.MoveExpr(1), @@ -3525,6 +3530,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::Expr::Concat &x) { if (!analyzer.fatalErrors()) { if (analyzer.IsIntrinsicConcat()) { analyzer.CheckForNullPointer(); + analyzer.CheckForAssumedRank(); return common::visit( [&](auto &&x, auto &&y) -> MaybeExpr { using T = ResultType; @@ -3567,11 +3573,12 @@ MaybeExpr RelationHelper(ExpressionAnalyzer &context, RelationalOperator opr, if (!analyzer.fatalErrors()) { std::optional leftType{analyzer.GetType(0)}; std::optional rightType{analyzer.GetType(1)}; - analyzer.ConvertBOZ(leftType, 0, rightType); - analyzer.ConvertBOZ(rightType, 1, leftType); + analyzer.ConvertBOZ(&leftType, 0, rightType); + analyzer.ConvertBOZ(&rightType, 1, leftType); if (leftType && rightType && analyzer.IsIntrinsicRelational(opr, *leftType, *rightType)) { analyzer.CheckForNullPointer("as a relational operand"); + analyzer.CheckForAssumedRank("as a relational operand"); return AsMaybeExpr(Relate(context.GetContextualMessages(), opr, analyzer.MoveExpr(0), analyzer.MoveExpr(1))); } else { @@ -3617,6 +3624,7 @@ MaybeExpr LogicalBinaryHelper(ExpressionAnalyzer &context, LogicalOperator opr, if (!analyzer.fatalErrors()) { if (analyzer.IsIntrinsicLogical()) { analyzer.CheckForNullPointer("as a logical operand"); + analyzer.CheckForAssumedRank("as a logical operand"); return AsGenericExpr(BinaryLogicalOperation(opr, std::get>(analyzer.MoveExpr(0).u), std::get>(analyzer.MoveExpr(1).u))); @@ -4330,6 +4338,18 @@ bool ArgumentAnalyzer::CheckForNullPointer(const char *where) { return true; } +bool ArgumentAnalyzer::CheckForAssumedRank(const char *where) { + for (const std::optional &arg : actuals_) { + if (arg && IsAssumedRank(arg->UnwrapExpr())) { + context_.Say(source_, + "An assumed-rank dummy argument is not allowed %s"_err_en_US, where); + fatalErrors_ = true; + return false; + } + } + return true; +} + MaybeExpr ArgumentAnalyzer::TryDefinedOp( const char *opr, parser::MessageFixedText error, bool isUserOp) { if (AnyUntypedOrMissingOperand()) { @@ -4404,7 +4424,7 @@ MaybeExpr ArgumentAnalyzer::TryDefinedOp( context_.Say( "Operands of %s are not conformable; have rank %d and rank %d"_err_en_US, ToUpperCase(opr), 
actuals_[0]->Rank(), actuals_[1]->Rank()); - } else if (CheckForNullPointer()) { + } else if (CheckForNullPointer() && CheckForAssumedRank()) { context_.Say(error, ToUpperCase(opr), TypeAsFortran(0), TypeAsFortran(1)); } return result; @@ -4468,9 +4488,22 @@ std::optional ArgumentAnalyzer::TryDefinedAssignment() { // allocatable (the explicit conversion would prevent the propagation of the // right hand side if it is a variable). Lowering will deal with the // conversion in this case. - if (lhsType && rhsType && - (!IsAllocatableDesignator(lhs) || context_.inWhereBody())) { - AddAssignmentConversion(*lhsType, *rhsType); + if (lhsType) { + if (rhsType) { + if (!IsAllocatableDesignator(lhs) || context_.inWhereBody()) { + AddAssignmentConversion(*lhsType, *rhsType); + } + } else { + if (lhsType->category() == TypeCategory::Integer || + lhsType->category() == TypeCategory::Real) { + ConvertBOZ(nullptr, 1, lhsType); + } + if (IsBOZLiteral(1)) { + context_.Say( + "Right-hand side of this assignment may not be BOZ"_err_en_US); + fatalErrors_ = true; + } + } } if (!fatalErrors_) { CheckAssignmentConformance(); @@ -4699,7 +4732,7 @@ int ArgumentAnalyzer::GetRank(std::size_t i) const { // otherType. If it's REAL convert to REAL, otherwise convert to INTEGER. // Note that IBM supports comparing BOZ literals to CHARACTER operands. That // is not currently supported. -void ArgumentAnalyzer::ConvertBOZ(std::optional &thisType, +void ArgumentAnalyzer::ConvertBOZ(std::optional *thisType, std::size_t i, std::optional otherType) { if (IsBOZLiteral(i)) { Expr &&argExpr{MoveExpr(i)}; @@ -4709,13 +4742,17 @@ void ArgumentAnalyzer::ConvertBOZ(std::optional &thisType, MaybeExpr realExpr{ ConvertToKind(kind, std::move(*boz))}; actuals_[i] = std::move(*realExpr); - thisType.emplace(TypeCategory::Real, kind); + if (thisType) { + thisType->emplace(TypeCategory::Real, kind); + } } else { int kind{context_.context().GetDefaultKind(TypeCategory::Integer)}; MaybeExpr intExpr{ ConvertToKind(kind, std::move(*boz))}; actuals_[i] = std::move(*intExpr); - thisType.emplace(TypeCategory::Integer, kind); + if (thisType) { + thisType->emplace(TypeCategory::Integer, kind); + } } } } diff --git a/flang/lib/Semantics/pointer-assignment.cpp b/flang/lib/Semantics/pointer-assignment.cpp index 077072060e9b11..6c634c64131917 100644 --- a/flang/lib/Semantics/pointer-assignment.cpp +++ b/flang/lib/Semantics/pointer-assignment.cpp @@ -148,6 +148,9 @@ bool PointerAssignmentChecker::CheckLeftHandSide(const SomeExpr &lhs) { msg->Attach(std::move(*whyNot)); } return false; + } else if (evaluate::IsAssumedRank(lhs)) { + Say("The left-hand side of a pointer assignment must not be an assumed-rank dummy argument"_err_en_US); + return false; } else { return true; } @@ -333,8 +336,8 @@ bool PointerAssignmentChecker::Check(const evaluate::Designator &d) { } else if (!isBoundsRemapping_ && !lhsType_->attrs().test(TypeAndShape::Attr::AssumedRank)) { - int lhsRank{evaluate::GetRank(lhsType_->shape())}; - int rhsRank{evaluate::GetRank(rhsType->shape())}; + int lhsRank{lhsType_->Rank()}; + int rhsRank{rhsType->Rank()}; if (lhsRank != rhsRank) { msg = MessageFormattedText{ "Pointer has rank %d but target has rank %d"_err_en_US, lhsRank, diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index d4fe668b92ec93..88822974e0134e 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -661,8 +661,8 @@ class ScopeHandler : public ImplicitRulesVisitor { void MakeExternal(Symbol &); // 
C815 duplicated attribute checking; returns false on error - bool CheckDuplicatedAttr(SourceName, const Symbol &, Attr); - bool CheckDuplicatedAttrs(SourceName, const Symbol &, Attrs); + bool CheckDuplicatedAttr(SourceName, Symbol &, Attr); + bool CheckDuplicatedAttrs(SourceName, Symbol &, Attrs); void SetExplicitAttr(Symbol &symbol, Attr attr) const { symbol.attrs().set(attr); @@ -1087,6 +1087,58 @@ class DeclarationVisitor : public ArraySpecVisitor, void NoteScalarSpecificationArgument(const Symbol &symbol) { mustBeScalar_.emplace(symbol); } + // Declare an object or procedure entity. + // T is one of: EntityDetails, ObjectEntityDetails, ProcEntityDetails + template + Symbol &DeclareEntity(const parser::Name &name, Attrs attrs) { + Symbol &symbol{MakeSymbol(name, attrs)}; + if (context().HasError(symbol) || symbol.has()) { + return symbol; // OK or error already reported + } else if (symbol.has()) { + symbol.set_details(T{}); + return symbol; + } else if (auto *details{symbol.detailsIf()}) { + symbol.set_details(T{std::move(*details)}); + return symbol; + } else if (std::is_same_v && + (symbol.has() || + symbol.has())) { + return symbol; // OK + } else if (auto *details{symbol.detailsIf()}) { + Say(name.source, + "'%s' is use-associated from module '%s' and cannot be re-declared"_err_en_US, + name.source, GetUsedModule(*details).name()); + } else if (auto *details{symbol.detailsIf()}) { + if (details->kind() == SubprogramKind::Module) { + Say2(name, + "Declaration of '%s' conflicts with its use as module procedure"_err_en_US, + symbol, "Module procedure definition"_en_US); + } else if (details->kind() == SubprogramKind::Internal) { + Say2(name, + "Declaration of '%s' conflicts with its use as internal procedure"_err_en_US, + symbol, "Internal procedure definition"_en_US); + } else { + DIE("unexpected kind"); + } + } else if (std::is_same_v && + symbol.has()) { + SayWithDecl( + name, symbol, "'%s' is already declared as a procedure"_err_en_US); + } else if (std::is_same_v && + symbol.has()) { + if (FindCommonBlockContaining(symbol)) { + SayWithDecl(name, symbol, + "'%s' may not be a procedure as it is in a COMMON block"_err_en_US); + } else { + SayWithDecl( + name, symbol, "'%s' is already declared as an object"_err_en_US); + } + } else if (!CheckPossibleBadForwardRef(symbol)) { + SayAlreadyDeclared(name, symbol); + } + context().SetError(symbol); + return symbol; + } private: // The attribute corresponding to the statement containing an ObjectDecl @@ -1151,59 +1203,6 @@ class DeclarationVisitor : public ArraySpecVisitor, bool PassesLocalityChecks( const parser::Name &name, Symbol &symbol, Symbol::Flag flag); bool CheckForHostAssociatedImplicit(const parser::Name &); - - // Declare an object or procedure entity. 
- // T is one of: EntityDetails, ObjectEntityDetails, ProcEntityDetails - template - Symbol &DeclareEntity(const parser::Name &name, Attrs attrs) { - Symbol &symbol{MakeSymbol(name, attrs)}; - if (context().HasError(symbol) || symbol.has()) { - return symbol; // OK or error already reported - } else if (symbol.has()) { - symbol.set_details(T{}); - return symbol; - } else if (auto *details{symbol.detailsIf()}) { - symbol.set_details(T{std::move(*details)}); - return symbol; - } else if (std::is_same_v && - (symbol.has() || - symbol.has())) { - return symbol; // OK - } else if (auto *details{symbol.detailsIf()}) { - Say(name.source, - "'%s' is use-associated from module '%s' and cannot be re-declared"_err_en_US, - name.source, GetUsedModule(*details).name()); - } else if (auto *details{symbol.detailsIf()}) { - if (details->kind() == SubprogramKind::Module) { - Say2(name, - "Declaration of '%s' conflicts with its use as module procedure"_err_en_US, - symbol, "Module procedure definition"_en_US); - } else if (details->kind() == SubprogramKind::Internal) { - Say2(name, - "Declaration of '%s' conflicts with its use as internal procedure"_err_en_US, - symbol, "Internal procedure definition"_en_US); - } else { - DIE("unexpected kind"); - } - } else if (std::is_same_v && - symbol.has()) { - SayWithDecl( - name, symbol, "'%s' is already declared as a procedure"_err_en_US); - } else if (std::is_same_v && - symbol.has()) { - if (FindCommonBlockContaining(symbol)) { - SayWithDecl(name, symbol, - "'%s' may not be a procedure as it is in a COMMON block"_err_en_US); - } else { - SayWithDecl( - name, symbol, "'%s' is already declared as an object"_err_en_US); - } - } else if (!CheckPossibleBadForwardRef(symbol)) { - SayAlreadyDeclared(name, symbol); - } - context().SetError(symbol); - return symbol; - } bool HasCycle(const Symbol &, const Symbol *interface); bool MustBeScalar(const Symbol &symbol) const { return mustBeScalar_.find(symbol) != mustBeScalar_.end(); @@ -1624,6 +1623,7 @@ class ResolveNamesVisitor : public virtual ScopeHandler, void PreSpecificationConstruct(const parser::SpecificationConstruct &); void CreateCommonBlockSymbols(const parser::CommonStmt &); + void CreateObjectSymbols(const std::list &, Attr); void CreateGeneric(const parser::GenericSpec &); void FinishSpecificationPart(const std::list &); void AnalyzeStmtFunctionStmt(const parser::StmtFunctionStmt &); @@ -2806,12 +2806,13 @@ void ScopeHandler::MakeExternal(Symbol &symbol) { } bool ScopeHandler::CheckDuplicatedAttr( - SourceName name, const Symbol &symbol, Attr attr) { + SourceName name, Symbol &symbol, Attr attr) { if (attr == Attr::SAVE) { // checked elsewhere } else if (symbol.attrs().test(attr)) { // C815 if (symbol.implicitAttrs().test(attr)) { // Implied attribute is now confirmed explicitly + symbol.implicitAttrs().reset(attr); } else { Say(name, "%s attribute was already specified on '%s'"_err_en_US, EnumToString(attr), name); @@ -2822,7 +2823,7 @@ bool ScopeHandler::CheckDuplicatedAttr( } bool ScopeHandler::CheckDuplicatedAttrs( - SourceName name, const Symbol &symbol, Attrs attrs) { + SourceName name, Symbol &symbol, Attrs attrs) { bool ok{true}; attrs.IterateOverMembers( [&](Attr x) { ok &= CheckDuplicatedAttr(name, symbol, x); }); @@ -5032,6 +5033,10 @@ Symbol &DeclarationVisitor::DeclareUnknownEntity( charInfo_.length.reset(); if (symbol.attrs().test(Attr::EXTERNAL)) { ConvertToProcEntity(symbol); + } else if (symbol.attrs().HasAny(Attrs{Attr::ALLOCATABLE, + Attr::ASYNCHRONOUS, Attr::CONTIGUOUS, Attr::PARAMETER, + 
Attr::SAVE, Attr::TARGET, Attr::VALUE, Attr::VOLATILE})) { + ConvertToObjectEntity(symbol); } if (attrs.test(Attr::BIND_C)) { SetBindNameOn(symbol); @@ -8551,11 +8556,19 @@ void ResolveNamesVisitor::PreSpecificationConstruct( } }, [&](const parser::Statement &y) { - if (const auto *commonStmt{parser::Unwrap(y)}) { - CreateCommonBlockSymbols(*commonStmt); - } + common::visit( + common::visitors{ + [&](const common::Indirection &z) { + CreateCommonBlockSymbols(z.value()); + }, + [&](const common::Indirection &z) { + CreateObjectSymbols(z.value().v, Attr::TARGET); + }, + [](const auto &) {}, + }, + y.statement.u); }, - [&](const auto &) {}, + [](const auto &) {}, }, spec.u); } @@ -8575,6 +8588,15 @@ void ResolveNamesVisitor::CreateCommonBlockSymbols( } } +void ResolveNamesVisitor::CreateObjectSymbols( + const std::list &decls, Attr attr) { + for (const parser::ObjectDecl &decl : decls) { + SetImplicitAttr(DeclareEntity( + std::get(decl.t), Attrs{}), + attr); + } +} + void ResolveNamesVisitor::CreateGeneric(const parser::GenericSpec &x) { auto info{GenericSpecInfo{x}}; SourceName symbolName{info.symbolName()}; diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 15ea34c66dba52..8939dc4499ec43 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -748,7 +748,7 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent( symbol, foldingContext)}; CHECK(typeAndShape.has_value()); auto dyType{typeAndShape->type()}; - const auto &shape{typeAndShape->shape()}; + int rank{typeAndShape->Rank()}; AddValue(values, componentSchema_, "name"s, SaveNameAsPointerTarget(scope, symbol.name().ToString())); AddValue(values, componentSchema_, "category"s, @@ -830,7 +830,6 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent( SomeExpr{evaluate::NullPointer{}}); } // Shape information - int rank{evaluate::GetRank(shape)}; AddValue(values, componentSchema_, "rank"s, IntExpr<1>(rank)); if (rank > 0 && !IsAllocatable(symbol) && !IsPointer(symbol)) { std::vector bounds; @@ -1143,7 +1142,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( isArgDescriptorSet |= 1; } else { which = scalarFinalEnum_; - if (int rank{evaluate::GetRank(typeAndShape.shape())}; rank > 0) { + if (int rank{typeAndShape.Rank()}; rank > 0) { which = IntExpr<1>(ToInt64(which).value() + rank); if (dummyData.IsPassedByDescriptor(proc->IsBindC())) { argThatMightBeDescriptor = 1; diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index b33d843710127b..44b0f17339cd96 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -182,7 +182,10 @@ __builtin_c_ptr_ne = x%__address /= y%__address end function - function __builtin_c_funloc(x) + ! Semantics has some special-case code that allows c_funloc() + ! to appear in a specification expression and exempts it + ! from the requirement that "x" be a pure dummy procedure. 
+ pure function __builtin_c_funloc(x) type(__builtin_c_funptr) :: __builtin_c_funloc external :: x __builtin_c_funloc = __builtin_c_funptr(loc(x)) diff --git a/flang/runtime/ISO_Fortran_binding.cpp b/flang/runtime/ISO_Fortran_binding.cpp index 99ba3aa56feee2..fe22026f31f55f 100644 --- a/flang/runtime/ISO_Fortran_binding.cpp +++ b/flang/runtime/ISO_Fortran_binding.cpp @@ -13,6 +13,7 @@ #include "terminator.h" #include "flang/ISO_Fortran_binding_wrapper.h" #include "flang/Runtime/descriptor.h" +#include "flang/Runtime/pointer.h" #include "flang/Runtime/type-code.h" #include @@ -75,7 +76,7 @@ RT_API_ATTRS int CFI_allocate(CFI_cdesc_t *descriptor, dim->sm = byteSize; byteSize *= extent; } - void *p{byteSize ? std::malloc(byteSize) : std::malloc(1)}; + void *p{runtime::AllocateValidatedPointerPayload(byteSize)}; if (!p && byteSize) { return CFI_ERROR_MEM_ALLOCATION; } @@ -91,8 +92,11 @@ RT_API_ATTRS int CFI_deallocate(CFI_cdesc_t *descriptor) { if (descriptor->version != CFI_VERSION) { return CFI_INVALID_DESCRIPTOR; } - if (descriptor->attribute != CFI_attribute_allocatable && - descriptor->attribute != CFI_attribute_pointer) { + if (descriptor->attribute == CFI_attribute_pointer) { + if (!runtime::ValidatePointerPayload(*descriptor)) { + return CFI_INVALID_DESCRIPTOR; + } + } else if (descriptor->attribute != CFI_attribute_allocatable) { // Non-interoperable object return CFI_INVALID_DESCRIPTOR; } diff --git a/flang/runtime/descriptor.cpp b/flang/runtime/descriptor.cpp index d8b51f1be0c5cb..9b04cb4f8d0d05 100644 --- a/flang/runtime/descriptor.cpp +++ b/flang/runtime/descriptor.cpp @@ -199,7 +199,16 @@ RT_API_ATTRS int Descriptor::Destroy( } } -RT_API_ATTRS int Descriptor::Deallocate() { return ISO::CFI_deallocate(&raw_); } +RT_API_ATTRS int Descriptor::Deallocate() { + ISO::CFI_cdesc_t &descriptor{raw()}; + if (!descriptor.base_addr) { + return CFI_ERROR_BASE_ADDR_NULL; + } else { + std::free(descriptor.base_addr); + descriptor.base_addr = nullptr; + return CFI_SUCCESS; + } +} RT_API_ATTRS bool Descriptor::DecrementSubscripts( SubscriptValue *subscript, const int *permutation) const { diff --git a/flang/runtime/execute.cpp b/flang/runtime/execute.cpp index 0f5bc5059e21dc..c7f8f386d81f4f 100644 --- a/flang/runtime/execute.cpp +++ b/flang/runtime/execute.cpp @@ -13,8 +13,10 @@ #include "tools.h" #include "flang/Runtime/descriptor.h" #include +#include #include #include + #ifdef _WIN32 #include "flang/Common/windows-include.h" #else @@ -32,13 +34,16 @@ namespace Fortran::runtime { // and the processor does not support asynchronous execution. Otherwise it is // assigned the value 0 enum CMD_STAT { - ASYNC_NO_SUPPORT_ERR = -2, - NO_SUPPORT_ERR = -1, - CMD_EXECUTED = 0, - FORK_ERR = 1, - EXECL_ERR = 2, - INVALID_CL_ERR = 3, - SIGNAL_ERR = 4 + ASYNC_NO_SUPPORT_ERR = -2, // system returns -1 with ENOENT + NO_SUPPORT_ERR = -1, // Linux setsid() returns -1 + CMD_EXECUTED = 0, // command executed with no error + FORK_ERR = 1, // Linux fork() returns < 0 + EXECL_ERR = 2, // system returns -1 with other errno + COMMAND_EXECUTION_ERR = 3, // exit code 1 + COMMAND_CANNOT_EXECUTE_ERR = 4, // Linux exit code 126 + COMMAND_NOT_FOUND_ERR = 5, // Linux exit code 127 + INVALID_CL_ERR = 6, // cover all other non-zero exit code + SIGNAL_ERR = 7 }; // Override CopyCharsToDescriptor in tools.h, pass string directly @@ -62,24 +67,86 @@ void CheckAndStoreIntToDescriptor( // If a condition occurs that would assign a nonzero value to CMDSTAT but // the CMDSTAT variable is not present, error termination is initiated. 
-int TerminationCheck(int status, const Descriptor *cmdstat, +std::int64_t TerminationCheck(std::int64_t status, const Descriptor *cmdstat, const Descriptor *cmdmsg, Terminator &terminator) { + // On both Windows and Linux, errno is set when system returns -1. if (status == -1) { - if (!cmdstat) { - terminator.Crash("Execution error with system status code: %d", status); + // On Windows, ENOENT means the command interpreter can't be found. + // On Linux, system calls execl with filepath "/bin/sh", ENOENT means the + // file pathname does not exist. + if (errno == ENOENT) { + if (!cmdstat) { + terminator.Crash("Command line execution is not supported, system " + "returns -1 with errno ENOENT."); + } else { + StoreIntToDescriptor(cmdstat, NO_SUPPORT_ERR, terminator); + CheckAndCopyCharsToDescriptor(cmdmsg, + "Command line execution is not supported, system returns -1 with " + "errno ENOENT."); + } } else { - StoreIntToDescriptor(cmdstat, EXECL_ERR, terminator); - CheckAndCopyCharsToDescriptor(cmdmsg, "Execution error"); + char err_buffer[30]; + char msg[]{"Execution error with system status code: -1, errno: "}; +#ifdef _WIN32 + if (strerror_s(err_buffer, sizeof(err_buffer), errno) != 0) +#else + if (strerror_r(errno, err_buffer, sizeof(err_buffer)) != 0) +#endif + terminator.Crash("errno to char msg failed."); + char *newMsg{static_cast(AllocateMemoryOrCrash( + terminator, std::strlen(msg) + std::strlen(err_buffer) + 1))}; + std::strcat(newMsg, err_buffer); + + if (!cmdstat) { + terminator.Crash(newMsg); + } else { + StoreIntToDescriptor(cmdstat, EXECL_ERR, terminator); + CheckAndCopyCharsToDescriptor(cmdmsg, newMsg); + } + FreeMemory(newMsg); } } + #ifdef _WIN32 // On WIN32 API std::system returns exit status directly - int exitStatusVal{status}; - if (exitStatusVal == 1) { + std::int64_t exitStatusVal{status}; + if (exitStatusVal != 0) { + if (!cmdstat) { + terminator.Crash( + "Invalid command quit with exit status code: %d", exitStatusVal); + } else { + StoreIntToDescriptor(cmdstat, INVALID_CL_ERR, terminator); + CheckAndCopyCharsToDescriptor(cmdmsg, "Invalid command line"); + } + } #else - int exitStatusVal{WEXITSTATUS(status)}; - if (exitStatusVal == 127 || exitStatusVal == 126) { -#endif + std::int64_t exitStatusVal{WEXITSTATUS(status)}; + if (exitStatusVal == 1) { + if (!cmdstat) { + terminator.Crash("Command line execution failed with exit code: 1."); + } else { + StoreIntToDescriptor(cmdstat, COMMAND_EXECUTION_ERR, terminator); + CheckAndCopyCharsToDescriptor( + cmdmsg, "Command line execution failed with exit code: 1."); + } + } else if (exitStatusVal == 126) { + if (!cmdstat) { + terminator.Crash("Command cannot be executed with exit code: 126."); + } else { + StoreIntToDescriptor(cmdstat, COMMAND_CANNOT_EXECUTE_ERR, terminator); + CheckAndCopyCharsToDescriptor( + cmdmsg, "Command cannot be executed with exit code: 126."); + } + } else if (exitStatusVal == 127) { + if (!cmdstat) { + terminator.Crash("Command not found with exit code: 127."); + } else { + StoreIntToDescriptor(cmdstat, COMMAND_NOT_FOUND_ERR, terminator); + CheckAndCopyCharsToDescriptor( + cmdmsg, "Command not found with exit code: 127."); + } + // capture all other nonzero exit code + } else if (exitStatusVal != 0) { if (!cmdstat) { terminator.Crash( "Invalid command quit with exit status code: %d", exitStatusVal); @@ -88,23 +155,26 @@ int TerminationCheck(int status, const Descriptor *cmdstat, CheckAndCopyCharsToDescriptor(cmdmsg, "Invalid command line"); } } +#endif + #if defined(WIFSIGNALED) && defined(WTERMSIG) 
if (WIFSIGNALED(status)) { if (!cmdstat) { - terminator.Crash("killed by signal: %d", WTERMSIG(status)); + terminator.Crash("Killed by signal: %d", WTERMSIG(status)); } else { StoreIntToDescriptor(cmdstat, SIGNAL_ERR, terminator); - CheckAndCopyCharsToDescriptor(cmdmsg, "killed by signal"); + CheckAndCopyCharsToDescriptor(cmdmsg, "Killed by signal"); } } #endif + #if defined(WIFSTOPPED) && defined(WSTOPSIG) if (WIFSTOPPED(status)) { if (!cmdstat) { - terminator.Crash("stopped by signal: %d", WSTOPSIG(status)); + terminator.Crash("Stopped by signal: %d", WSTOPSIG(status)); } else { StoreIntToDescriptor(cmdstat, SIGNAL_ERR, terminator); - CheckAndCopyCharsToDescriptor(cmdmsg, "stopped by signal"); + CheckAndCopyCharsToDescriptor(cmdmsg, "Stopped by signal"); } } #endif @@ -134,8 +204,9 @@ void RTNAME(ExecuteCommandLine)(const Descriptor &command, bool wait, if (wait) { // either wait is not specified or wait is true: synchronous mode - int status{std::system(newCmd)}; - int exitStatusVal{TerminationCheck(status, cmdstat, cmdmsg, terminator)}; + std::int64_t status{std::system(newCmd)}; + std::int64_t exitStatusVal{ + TerminationCheck(status, cmdstat, cmdmsg, terminator)}; // If sync, assigned processor-dependent exit status. Otherwise unchanged CheckAndStoreIntToDescriptor(exitstat, exitStatusVal, terminator); } else { @@ -173,7 +244,7 @@ void RTNAME(ExecuteCommandLine)(const Descriptor &command, bool wait, terminator.Crash( "CreateProcess failed with error code: %lu.", GetLastError()); } else { - StoreIntToDescriptor(cmdstat, (uint32_t)GetLastError(), terminator); + StoreIntToDescriptor(cmdstat, ASYNC_NO_SUPPORT_ERR, terminator); CheckAndCopyCharsToDescriptor(cmdmsg, "CreateProcess failed."); } } @@ -201,7 +272,7 @@ void RTNAME(ExecuteCommandLine)(const Descriptor &command, bool wait, } exit(EXIT_FAILURE); } - int status{std::system(newCmd)}; + std::int64_t status{std::system(newCmd)}; TerminationCheck(status, cmdstat, cmdmsg, terminator); exit(status); } diff --git a/flang/runtime/external-unit.cpp b/flang/runtime/external-unit.cpp index 328c994a180b19..8009151a8a370c 100644 --- a/flang/runtime/external-unit.cpp +++ b/flang/runtime/external-unit.cpp @@ -65,10 +65,17 @@ ExternalFileUnit *ExternalFileUnit::LookUpOrCreateAnonymous(int unit, bool exists{false}; ExternalFileUnit *result{GetUnitMap().LookUpOrCreate(unit, handler, exists)}; if (result && !exists) { - result->OpenAnonymousUnit( - dir == Direction::Input ? OpenStatus::Unknown : OpenStatus::Replace, - Action::ReadWrite, Position::Rewind, Convert::Unknown, handler); - result->isUnformatted = isUnformatted; + if (!result->OpenAnonymousUnit( + dir == Direction::Input ? 
OpenStatus::Unknown : OpenStatus::Replace, + Action::ReadWrite, Position::Rewind, Convert::Unknown, handler)) { + // fort.N isn't a writable file + if (ExternalFileUnit * closed{LookUpForClose(result->unitNumber())}) { + closed->DestroyClosed(); + } + result = nullptr; + } else { + result->isUnformatted = isUnformatted; + } } return result; } @@ -183,7 +190,7 @@ bool ExternalFileUnit::OpenUnit(Fortran::common::optional status, return impliedClose; } -void ExternalFileUnit::OpenAnonymousUnit( +bool ExternalFileUnit::OpenAnonymousUnit( Fortran::common::optional status, Fortran::common::optional action, Position position, Convert convert, IoErrorHandler &handler) { @@ -193,6 +200,7 @@ void ExternalFileUnit::OpenAnonymousUnit( std::snprintf(path.get(), pathMaxLen, "fort.%d", unitNumber_); OpenUnit(status, action, position, std::move(path), std::strlen(path.get()), convert, handler); + return IsConnected(); } void ExternalFileUnit::CloseUnit(CloseStatus status, IoErrorHandler &handler) { diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp index 543284cb5c3633..8f9b50a549e1fd 100644 --- a/flang/runtime/matmul.cpp +++ b/flang/runtime/matmul.cpp @@ -288,11 +288,25 @@ static inline RT_API_ATTRS void DoMatmul( } SubscriptValue n{x.GetDimension(xRank - 1).Extent()}; if (n != y.GetDimension(0).Extent()) { - terminator.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jdx%jd)", - static_cast(x.GetDimension(0).Extent()), - static_cast(n), - static_cast(y.GetDimension(0).Extent()), - static_cast(y.GetDimension(1).Extent())); + // At this point, we know that there's a shape error. There are three + // possibilities, x is rank 1, y is rank 1, or both are rank 2. + if (xRank == 1) { + terminator.Crash("MATMUL: unacceptable operand shapes (%jd, %jdx%jd)", + static_cast(n), + static_cast(y.GetDimension(0).Extent()), + static_cast(y.GetDimension(1).Extent())); + } else if (yRank == 1) { + terminator.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jd)", + static_cast(x.GetDimension(0).Extent()), + static_cast(n), + static_cast(y.GetDimension(0).Extent())); + } else { + terminator.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jdx%jd)", + static_cast(x.GetDimension(0).Extent()), + static_cast(n), + static_cast(y.GetDimension(0).Extent()), + static_cast(y.GetDimension(1).Extent())); + } } using WriteResult = CppTypeFor(static_cast(p) + byteSize)}; + *footer = ~reinterpret_cast(p); + } + return p; +} + int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; @@ -137,22 +154,12 @@ int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat, elementBytes = pointer.raw().elem_len = 0; } std::size_t byteSize{pointer.Elements() * elementBytes}; - // Add space for a footer to validate during DEALLOCATE. - constexpr std::size_t align{sizeof(std::uintptr_t)}; - byteSize = ((byteSize + align - 1) / align) * align; - std::size_t total{byteSize + sizeof(std::uintptr_t)}; - void *p{std::malloc(total)}; + void *p{AllocateValidatedPointerPayload(byteSize)}; if (!p) { return ReturnError(terminator, CFI_ERROR_MEM_ALLOCATION, errMsg, hasStat); } pointer.set_base_addr(p); pointer.SetByteStrides(); - // Fill the footer word with the XOR of the ones' complement of - // the base address, which is a value that would be highly unlikely - // to appear accidentally at the right spot. 
- std::uintptr_t *footer{ - reinterpret_cast(static_cast(p) + byteSize)}; - *footer = ~reinterpret_cast(p); int stat{StatOk}; if (const DescriptorAddendum * addendum{pointer.Addendum()}) { if (const auto *derived{addendum->derivedType()}) { @@ -176,6 +183,27 @@ int RTDEF(PointerAllocateSource)(Descriptor &pointer, const Descriptor &source, return stat; } +static RT_API_ATTRS std::size_t GetByteSize( + const ISO::CFI_cdesc_t &descriptor) { + std::size_t rank{descriptor.rank}; + const ISO::CFI_dim_t *dim{descriptor.dim}; + std::size_t byteSize{descriptor.elem_len}; + for (std::size_t j{0}; j < rank; ++j) { + byteSize *= dim[j].extent; + } + return byteSize; +} + +bool RT_API_ATTRS ValidatePointerPayload(const ISO::CFI_cdesc_t &desc) { + std::size_t byteSize{GetByteSize(desc)}; + constexpr std::size_t align{sizeof(std::uintptr_t)}; + byteSize = ((byteSize + align - 1) / align) * align; + const void *p{desc.base_addr}; + const std::uintptr_t *footer{reinterpret_cast( + static_cast(p) + byteSize)}; + return *footer == ~reinterpret_cast(p); +} + int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; @@ -185,20 +213,9 @@ int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat, if (!pointer.IsAllocated()) { return ReturnError(terminator, StatBaseNull, errMsg, hasStat); } - if (executionEnvironment.checkPointerDeallocation) { - // Validate the footer. This should fail if the pointer doesn't - // span the entire object, or the object was not allocated as a - // pointer. - std::size_t byteSize{pointer.Elements() * pointer.ElementBytes()}; - constexpr std::size_t align{sizeof(std::uintptr_t)}; - byteSize = ((byteSize + align - 1) / align) * align; - void *p{pointer.raw().base_addr}; - std::uintptr_t *footer{ - reinterpret_cast(static_cast(p) + byteSize)}; - if (*footer != ~reinterpret_cast(p)) { - return ReturnError( - terminator, StatBadPointerDeallocation, errMsg, hasStat); - } + if (executionEnvironment.checkPointerDeallocation && + !ValidatePointerPayload(pointer.raw())) { + return ReturnError(terminator, StatBadPointerDeallocation, errMsg, hasStat); } return ReturnError(terminator, pointer.Destroy(/*finalize=*/true, /*destroyPointers=*/true, &terminator), diff --git a/flang/runtime/pseudo-unit.cpp b/flang/runtime/pseudo-unit.cpp index 70e70eec6a77ea..526afd11d916e9 100644 --- a/flang/runtime/pseudo-unit.cpp +++ b/flang/runtime/pseudo-unit.cpp @@ -65,7 +65,7 @@ bool ExternalFileUnit::OpenUnit(Fortran::common::optional status, handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION); } -void ExternalFileUnit::OpenAnonymousUnit(Fortran::common::optional, +bool ExternalFileUnit::OpenAnonymousUnit(Fortran::common::optional, Fortran::common::optional, Position, Convert convert, IoErrorHandler &handler) { handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION); diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h index abd535fd0381de..83f839e205a483 100644 --- a/flang/runtime/unit.h +++ b/flang/runtime/unit.h @@ -134,7 +134,7 @@ class ExternalFileUnit : public ConnectionState, RT_API_ATTRS bool OpenUnit(Fortran::common::optional, Fortran::common::optional, Position, OwningPtr &&path, std::size_t pathLength, Convert, IoErrorHandler &); - RT_API_ATTRS void OpenAnonymousUnit(Fortran::common::optional, + RT_API_ATTRS bool OpenAnonymousUnit(Fortran::common::optional, Fortran::common::optional, Position, Convert, IoErrorHandler &); RT_API_ATTRS void CloseUnit(CloseStatus, 
IoErrorHandler &); RT_API_ATTRS void DestroyClosed(); diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90 index eea7a0f665b34e..e78c3516db45a0 100644 --- a/flang/test/Driver/target-cpu-features.f90 +++ b/flang/test/Driver/target-cpu-features.f90 @@ -31,11 +31,11 @@ ! CHECK-A57: "-fc1" "-triple" "aarch64-unknown-linux-gnu" ! CHECK-A57-SAME: "-target-cpu" "cortex-a57" -! CHECK-A57-SAME: "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+perfmon" "-target-feature" "+sha2" "-target-feature" "+neon" +! CHECK-A57-SAME: "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2 ! CHECK-A76: "-fc1" "-triple" "aarch64-unknown-linux-gnu" ! CHECK-A76-SAME: "-target-cpu" "cortex-a76" -! CHECK-A76-SAME: "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" "-target-feature" "+ssbs" +! CHECK-A76-SAME: "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+ssbs" ! CHECK-ARMV9: "-fc1" "-triple" "aarch64-unknown-linux-gnu" ! CHECK-ARMV9-SAME: "-target-cpu" "generic" diff --git a/flang/test/Evaluate/fold-nearest.f90 b/flang/test/Evaluate/fold-nearest.f90 index bd8b020c392ac3..a7366e6d75407e 100644 --- a/flang/test/Evaluate/fold-nearest.f90 +++ b/flang/test/Evaluate/fold-nearest.f90 @@ -28,6 +28,12 @@ module m1 logical, parameter :: test_15 = nearest(negZero, 0.) == minSubnormal logical, parameter :: test_16 = nearest(tiny(1.),-1.) == 1.1754942E-38 logical, parameter :: test_17 = nearest(tiny(1.),1.) == 1.1754945E-38 + contains + subroutine subr(a) + real, intent(in) :: a + !WARN: warning: NEAREST: S argument is zero + print *, nearest(a, 0.) + end end module module m2 diff --git a/flang/test/Evaluate/folding04.f90 b/flang/test/Evaluate/folding04.f90 index 86ae8debd6ef12..c7815b03403608 100644 --- a/flang/test/Evaluate/folding04.f90 +++ b/flang/test/Evaluate/folding04.f90 @@ -32,11 +32,22 @@ module real_tests !WARN: warning: invalid argument on evaluation of intrinsic function or operation real(4), parameter :: nan_r4_acos5 = acos(r4_pinf) TEST_ISNAN(nan_r4_acos5) - !WARN: warning: second argument to MOD must not be zero + !WARN: warning: MOD: P argument is zero real(4), parameter :: nan_r4_mod = mod(3.5, 0.) TEST_ISNAN(nan_r4_mod) !WARN: warning: overflow on evaluation of intrinsic function or operation logical, parameter :: test_exp_overflow = exp(256._4).EQ.r4_pinf + contains + subroutine s1(a,j) + !WARN: warning: MOD: P argument is zero + print *, mod(a, 0.) + !WARN: warning: MODULO: P argument is zero + print *, modulo(a, 0.) + !WARN: warning: MOD: P argument is zero + print *, mod(j, 0.) + !WARN: warning: MODULO: P argument is zero + print *, modulo(j, 0.) 
+ end end module module parentheses diff --git a/flang/test/Evaluate/rewrite06.f90 b/flang/test/Evaluate/rewrite06.f90 index 03eb463fe9bd5d..f27e6256556a9a 100644 --- a/flang/test/Evaluate/rewrite06.f90 +++ b/flang/test/Evaluate/rewrite06.f90 @@ -31,3 +31,9 @@ subroutine test(k) print *, storage_size(return_pdt(k)) end subroutine end module + +subroutine test_assumed_rank(x) + real :: x(..) + !CHECK: PRINT *, sizeof(x) + print *, sizeof(x) +end subroutine diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir index 834017bff71aa3..9e4ea0bc210775 100644 --- a/flang/test/Fir/boxproc.fir +++ b/flang/test/Fir/boxproc.fir @@ -16,9 +16,7 @@ // CHECK-LABEL: define void @_QPtest_proc_dummy_other(ptr // CHECK-SAME: %[[VAL_0:.*]]) -// CHECK: %[[VAL_1:.*]] = alloca i32, i64 1, align 4 -// CHECK: store i32 4, ptr %[[VAL_1]], align 4 -// CHECK: call void %[[VAL_0]](ptr %[[VAL_1]]) +// CHECK: call void %[[VAL_0]](ptr %{{.*}}) func.func @_QPtest_proc_dummy() { %c0_i32 = arith.constant 0 : i32 diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 396fbaeacf39f1..8b62787bb30942 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -1006,3 +1006,77 @@ func.func @omp_map_info_nested_derived_type_explicit_member_conversion(%arg0 : ! } // ----- + +// CHECK-LABEL: llvm.func @omp_map_common_block_using_common_block_symbol + +// CHECK: %[[ADDR_OF:.*]] = llvm.mlir.addressof @var_common_ : !llvm.ptr +// CHECK: %[[CB_MAP:.*]] = omp.map.info var_ptr(%[[ADDR_OF]] : !llvm.ptr, !llvm.array<8 x i8>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var_common"} +// CHECK: omp.target map_entries(%[[CB_MAP]] -> %[[ARG0:.*]] : !llvm.ptr) { +// CHECK: ^bb0(%[[ARG0]]: !llvm.ptr): +// CHECK: %[[VAR_2_OFFSET:.*]] = llvm.mlir.constant(4 : index) : i64 +// CHECK: %[[VAR_1_OFFSET:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %{{.*}} = llvm.getelementptr %[[ARG0]][%[[VAR_1_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 +// CHECK: %{{.*}} = llvm.getelementptr %[[ARG0]][%[[VAR_2_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + +func.func @omp_map_common_block_using_common_block_symbol() { + %0 = fir.address_of(@var_common_) : !fir.ref> + %1 = omp.map.info var_ptr(%0 : !fir.ref>, !fir.array<8xi8>) map_clauses(tofrom) capture(ByRef) -> !fir.ref> {name = "var_common"} + omp.target map_entries(%1 -> %arg0 : !fir.ref>) { + ^bb0(%arg0: !fir.ref>): + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %c20_i32 = arith.constant 20 : i32 + %2 = fir.convert %arg0 : (!fir.ref>) -> !fir.ref> + %3 = fir.coordinate_of %2, %c0 : (!fir.ref>, index) -> !fir.ref + %4 = fir.convert %3 : (!fir.ref) -> !fir.ref + %5 = fir.convert %arg0 : (!fir.ref>) -> !fir.ref> + %6 = fir.coordinate_of %5, %c4 : (!fir.ref>, index) -> !fir.ref + %7 = fir.convert %6 : (!fir.ref) -> !fir.ref + %8 = fir.load %4 : !fir.ref + %9 = arith.addi %8, %c20_i32 : i32 + fir.store %9 to %7 : !fir.ref + omp.terminator + } + return +} + +fir.global common @var_common_(dense<0> : vector<8xi8>) {alignment = 4 : i64} : !fir.array<8xi8> + +// ----- + +// CHECK-LABEL: llvm.func @omp_map_common_block_using_common_block_members + +// CHECK: %[[VAR_2_OFFSET:.*]] = llvm.mlir.constant(4 : index) : i64 +// CHECK: %[[VAR_1_OFFSET:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[ADDR_OF:.*]] = llvm.mlir.addressof @var_common_ : !llvm.ptr +// CHECK: %[[VAR_1_CB_GEP:.*]] = llvm.getelementptr 
%[[ADDR_OF]][%[[VAR_1_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 +// CHECK: %[[VAR_2_CB_GEP:.*]] = llvm.getelementptr %[[ADDR_OF]][%[[VAR_2_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 +// CHECK: %[[MAP_CB_VAR_1:.*]] = omp.map.info var_ptr(%[[VAR_1_CB_GEP]] : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var1"} +// CHECK: %[[MAP_CB_VAR_2:.*]] = omp.map.info var_ptr(%[[VAR_2_CB_GEP]] : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var2"} +// CHECK: omp.target map_entries(%[[MAP_CB_VAR_1]] -> %[[ARG0:.*]], %[[MAP_CB_VAR_2]] -> %[[ARG1:.*]] : !llvm.ptr, !llvm.ptr) { +// CHECK: ^bb0(%[[ARG0]]: !llvm.ptr, %[[ARG1]]: !llvm.ptr): + +func.func @omp_map_common_block_using_common_block_members() { + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = fir.address_of(@var_common_) : !fir.ref> + %1 = fir.convert %0 : (!fir.ref>) -> !fir.ref> + %2 = fir.coordinate_of %1, %c0 : (!fir.ref>, index) -> !fir.ref + %3 = fir.convert %2 : (!fir.ref) -> !fir.ref + %4 = fir.convert %0 : (!fir.ref>) -> !fir.ref> + %5 = fir.coordinate_of %4, %c4 : (!fir.ref>, index) -> !fir.ref + %6 = fir.convert %5 : (!fir.ref) -> !fir.ref + %7 = omp.map.info var_ptr(%3 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "var1"} + %8 = omp.map.info var_ptr(%6 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "var2"} + omp.target map_entries(%7 -> %arg0, %8 -> %arg1 : !fir.ref, !fir.ref) { + ^bb0(%arg0: !fir.ref, %arg1: !fir.ref): + %c10_i32 = arith.constant 10 : i32 + %9 = fir.load %arg0 : !fir.ref + %10 = arith.muli %9, %c10_i32 : i32 + fir.store %10 to %arg1 : !fir.ref + omp.terminator + } + return +} + +fir.global common @var_common_(dense<0> : vector<8xi8>) {alignment = 4 : i64} : !fir.array<8xi8> diff --git a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 b/flang/test/Integration/OpenMP/map-types-and-sizes.f90 index f3a20690f05a9b..591be0b680a512 100644 --- a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 +++ b/flang/test/Integration/OpenMP/map-types-and-sizes.f90 @@ -231,6 +231,31 @@ subroutine mapType_char !$omp end target end subroutine mapType_char +!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 8] +!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 35] +subroutine mapType_common_block + implicit none + common /var_common/ var1, var2 + integer :: var1, var2 +!$omp target map(tofrom: /var_common/) + var1 = var1 + 20 + var2 = var2 + 30 +!$omp end target +end subroutine mapType_common_block + +!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [2 x i64] [i64 4, i64 4] +!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +subroutine mapType_common_block_members + implicit none + common /var_common/ var1, var2 + integer :: var1, var2 + +!$omp target map(tofrom: var1, var2) + var2 = var1 +!$omp end target +end subroutine mapType_common_block_members + + !CHECK-LABEL: define {{.*}} @{{.*}}maptype_ptr_explicit_{{.*}} !CHECK: %[[ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 !CHECK: %[[ALLOCA_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[ALLOCA]], i32 1 @@ -346,3 +371,19 @@ end subroutine mapType_char !CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8 !CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 !CHECK: store ptr %[[ARR_OFF]], ptr %[[OFFLOAD_PTR_ARR]], align 8 + +!CHECK-LABEL: 
define {{.*}} @{{.*}}maptype_common_block_{{.*}} +!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 +!CHECK: store ptr @var_common_, ptr %[[BASE_PTR_ARR]], align 8 +!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 +!CHECK: store ptr @var_common_, ptr %[[OFFLOAD_PTR_ARR]], align 8 + +!CHECK-LABEL: define {{.*}} @{{.*}}maptype_common_block_members_{{.*}} +!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [2 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 +!CHECK: store ptr @var_common_, ptr %[[BASE_PTR_ARR]], align 8 +!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [2 x ptr], ptr %.offload_ptrs, i32 0, i32 0 +!CHECK: store ptr @var_common_, ptr %[[OFFLOAD_PTR_ARR]], align 8 +!CHECK: %[[BASE_PTR_ARR_1:.*]] = getelementptr inbounds [2 x ptr], ptr %.offload_baseptrs, i32 0, i32 1 +!CHECK: store ptr getelementptr (i8, ptr @var_common_, i64 4), ptr %[[BASE_PTR_ARR_1]], align 8 +!CHECK: %[[OFFLOAD_PTR_ARR_1:.*]] = getelementptr inbounds [2 x ptr], ptr %.offload_ptrs, i32 0, i32 1 +!CHECK: store ptr getelementptr (i8, ptr @var_common_, i64 4), ptr %[[OFFLOAD_PTR_ARR_1]], align 8 diff --git a/flang/test/Integration/debug-char-type-1.f90 b/flang/test/Integration/debug-char-type-1.f90 index a0aebd3125c6e6..5068663aa9e28e 100644 --- a/flang/test/Integration/debug-char-type-1.f90 +++ b/flang/test/Integration/debug-char-type-1.f90 @@ -2,6 +2,7 @@ module helper character(len=40) :: str + character(len=:), allocatable :: str2 end module helper program test @@ -11,11 +12,14 @@ program test first = '3.14 = π' second = 'Fortran' str = 'Hello World!' + str2 = 'A quick brown fox jumps over a lazy dog' end program test ! CHECK-DAG: !DIGlobalVariable(name: "str"{{.*}}type: ![[TY40:[0-9]+]]{{.*}}) ! CHECK-DAG: ![[TY40]] = !DIStringType(size: 320, encoding: DW_ATE_ASCII) +! CHECK-DAG: !DIGlobalVariable(name: "str2"{{.*}}type: ![[TY:[0-9]+]]{{.*}}) +! CHECK-DAG: ![[TY]] = !DIStringType(stringLengthExpression: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 8), stringLocationExpression: !DIExpression(DW_OP_push_object_address, DW_OP_deref), encoding: DW_ATE_ASCII) ! CHECK-DAG: !DILocalVariable(name: "first"{{.*}}type: ![[TY8:[0-9]+]]) ! CHECK-DAG: ![[TY8]] = !DIStringType(size: 256, encoding: DW_ATE_UCS) ! CHECK-DAG: !DILocalVariable(name: "second"{{.*}}type: ![[TY10:[0-9]+]]) -! CHECK-DAG: ![[TY10]] = !DIStringType(size: 80, encoding: DW_ATE_ASCII) \ No newline at end of file +! CHECK-DAG: ![[TY10]] = !DIStringType(size: 80, encoding: DW_ATE_ASCII) diff --git a/flang/test/Integration/debug-ptr-type.f90 b/flang/test/Integration/debug-ptr-type.f90 new file mode 100644 index 00000000000000..bff7bcb862b5c3 --- /dev/null +++ b/flang/test/Integration/debug-ptr-type.f90 @@ -0,0 +1,48 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +subroutine ff(n, m) + implicit none + integer i, j, m, n + real(4), pointer :: par(:, :) + integer, pointer :: psc + integer, pointer :: par2(:) + character(len=16), pointer :: pstr + real(4), target :: ar(4, 5) + integer, target :: sc + integer, target, allocatable :: ar2(:) + character(len=:), target, allocatable :: str + + str = 'Hello' + pstr => str + allocate(ar2(4)) + par2 => ar2 + do i=1,5 + do j=1,4 + ar(j,i) = 0.1 + par2(j) = j + end do + end do + sc = 3 + psc => sc + par => ar + + print *, sc + print *, ar + print *, ar2 + print *, str + print *, psc + print *, par + print *, par2 + print *, pstr +end subroutine ff + + +! 
CHECK-DAG: ![[INT_TY:[0-9]+]] = !DIBasicType(name: "integer"{{.*}}) +! CHECK-DAG: ![[ELEMS1:[0-9]+]] = !{!{{[0-9]+}}} +! CHECK-DAG: !DILocalVariable(name: "par"{{.*}}type: ![[ARR_TY1:[0-9]+]]) +! CHECK-DAG: ![[ARR_TY1]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS2:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref), associated: !DIExpression(DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne)) +! CHECK-DAG: ![[ELEMS2]] = !{!{{[0-9]+}}, !{{[0-9]+}}} +! CHECK-DAG: !DILocalVariable(name: "par2"{{.*}}type: ![[ARR_TY2:[0-9]+]]) +! CHECK-DAG: ![[ARR_TY2]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}, elements: ![[ELEMS1]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref), associated: !DIExpression(DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne)) +! CHECK-DAG: !DILocalVariable(name: "psc"{{.*}}type: ![[PTR_TY:[0-9]+]]) +! CHECK-DAG: ![[PTR_TY]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[INT_TY]]{{.*}}) diff --git a/flang/test/Lower/OpenMP/common-block-map.f90 b/flang/test/Lower/OpenMP/common-block-map.f90 new file mode 100644 index 00000000000000..5033129683a8eb --- /dev/null +++ b/flang/test/Lower/OpenMP/common-block-map.f90 @@ -0,0 +1,83 @@ +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +!CHECK: fir.global common @var_common_(dense<0> : vector<8xi8>) {{.*}} : !fir.array<8xi8> +!CHECK: fir.global common @var_common_link_(dense<0> : vector<8xi8>) {{{.*}} omp.declare_target = #omp.declaretarget} : !fir.array<8xi8> + +!CHECK-LABEL: func.func @_QPmap_full_block +!CHECK: %[[CB_ADDR:.*]] = fir.address_of(@var_common_) : !fir.ref> +!CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[CB_ADDR]] : !fir.ref>, !fir.array<8xi8>) map_clauses(tofrom) capture(ByRef) -> !fir.ref> {name = "var_common"} +!CHECK: omp.target map_entries(%[[MAP]] -> %[[MAP_ARG:.*]] : !fir.ref>) { +!CHECK: ^bb0(%[[MAP_ARG]]: !fir.ref>): +!CHECK: %[[CONV:.*]] = fir.convert %[[MAP_ARG]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[INDEX:.*]] = arith.constant 0 : index +!CHECK: %[[COORD:.*]] = fir.coordinate_of %[[CONV]], %[[INDEX]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[CONV2:.*]] = fir.convert %[[COORD]] : (!fir.ref) -> !fir.ref +!CHECK: %[[CB_MEMBER_1:.*]]:2 = hlfir.declare %[[CONV2]] {uniq_name = "_QFmap_full_blockEvar1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[CONV3:.*]] = fir.convert %[[MAP_ARG]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[INDEX2:.*]] = arith.constant 4 : index +!CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[CONV3]], %[[INDEX2]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[CONV4:.*]] = fir.convert %[[COORD2]] : (!fir.ref) -> !fir.ref +!CHECK: %[[CB_MEMBER_2:.*]]:2 = hlfir.declare %[[CONV4]] {uniq_name = "_QFmap_full_blockEvar2"} : (!fir.ref) -> (!fir.ref, !fir.ref) +subroutine map_full_block + implicit none + common /var_common/ var1, var2 + integer :: var1, var2 +!$omp target map(tofrom: /var_common/) + var1 = var1 + 20 + var2 = var2 + 30 +!$omp end target +end + +!CHECK-LABEL: @_QPmap_mix_of_members +!CHECK: %[[COMMON_BLOCK:.*]] = fir.address_of(@var_common_) : !fir.ref> +!CHECK: %[[CB_CONV:.*]] = fir.convert %[[COMMON_BLOCK]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[INDEX:.*]] = arith.constant 0 : index +!CHECK: %[[COORD:.*]] = fir.coordinate_of %[[CB_CONV]], %[[INDEX]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[CONV:.*]] = fir.convert %[[COORD]] : (!fir.ref) -> !fir.ref +!CHECK: %[[CB_MEMBER_1:.*]]:2 = hlfir.declare %[[CONV]] {uniq_name = "_QFmap_mix_of_membersEvar1"} : (!fir.ref) -> 
(!fir.ref, !fir.ref) +!CHECK: %[[CB_CONV:.*]] = fir.convert %[[COMMON_BLOCK]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[INDEX:.*]] = arith.constant 4 : index +!CHECK: %[[COORD:.*]] = fir.coordinate_of %[[CB_CONV]], %[[INDEX]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[CONV:.*]] = fir.convert %[[COORD]] : (!fir.ref) -> !fir.ref +!CHECK: %[[CB_MEMBER_2:.*]]:2 = hlfir.declare %[[CONV]] {uniq_name = "_QFmap_mix_of_membersEvar2"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[MAP_EXP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_2]]#0 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "var2"} +!CHECK: %[[MAP_IMP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_1]]#1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "var1"} +!CHECK: omp.target map_entries(%[[MAP_EXP]] -> %[[ARG_EXP:.*]], %[[MAP_IMP]] -> %[[ARG_IMP:.*]] : !fir.ref, !fir.ref) { +!CHECK: ^bb0(%[[ARG_EXP]]: !fir.ref, %[[ARG_IMP]]: !fir.ref): +!CHECK: %[[EXP_MEMBER:.*]]:2 = hlfir.declare %[[ARG_EXP]] {uniq_name = "_QFmap_mix_of_membersEvar2"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[IMP_MEMBER:.*]]:2 = hlfir.declare %[[ARG_IMP]] {uniq_name = "_QFmap_mix_of_membersEvar1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +subroutine map_mix_of_members + implicit none + common /var_common/ var1, var2 + integer :: var1, var2 + +!$omp target map(tofrom: var2) + var2 = var1 +!$omp end target +end + +!CHECK-LABEL: @_QQmain +!CHECK: %[[DECL_TAR_CB:.*]] = fir.address_of(@var_common_link_) : !fir.ref> +!CHECK: %[[MAP_DECL_TAR_CB:.*]] = omp.map.info var_ptr(%[[DECL_TAR_CB]] : !fir.ref>, !fir.array<8xi8>) map_clauses(tofrom) capture(ByRef) -> !fir.ref> {name = "var_common_link"} +!CHECK: omp.target map_entries(%[[MAP_DECL_TAR_CB]] -> %[[MAP_DECL_TAR_ARG:.*]] : !fir.ref>) { +!CHECK: ^bb0(%[[MAP_DECL_TAR_ARG]]: !fir.ref>): +!CHECK: %[[CONV:.*]] = fir.convert %[[MAP_DECL_TAR_ARG]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[INDEX:.*]] = arith.constant 0 : index +!CHECK: %[[COORD:.*]] = fir.coordinate_of %[[CONV]], %[[INDEX]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[CONV:.*]] = fir.convert %[[COORD]] : (!fir.ref) -> !fir.ref +!CHECK: %[[MEMBER_ONE:.*]]:2 = hlfir.declare %[[CONV]] {uniq_name = "_QFElink1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[CONV:.*]] = fir.convert %[[MAP_DECL_TAR_ARG]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[INDEX:.*]] = arith.constant 4 : index +!CHECK: %[[COORD:.*]] = fir.coordinate_of %[[CONV]], %[[INDEX]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[CONV:.*]] = fir.convert %[[COORD]] : (!fir.ref) -> !fir.ref +!CHECK: %[[MEMBER_TWO:.*]]:2 = hlfir.declare %[[CONV]] {uniq_name = "_QFElink2"} : (!fir.ref) -> (!fir.ref, !fir.ref) +program main + implicit none + common /var_common_link/ link1, link2 + integer :: link1, link2 + !$omp declare target link(/var_common_link/) + +!$omp target map(tofrom: /var_common_link/) + link1 = link2 + 20 +!$omp end target +end program diff --git a/flang/test/Lower/OpenMP/copyprivate2.f90 b/flang/test/Lower/OpenMP/copyprivate2.f90 new file mode 100644 index 00000000000000..f10d509d805e29 --- /dev/null +++ b/flang/test/Lower/OpenMP/copyprivate2.f90 @@ -0,0 +1,56 @@ +! Test lowering of COPYPRIVATE with allocatable/pointer variables. +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +!CHECK-LABEL: func private @_copy_box_ptr_i32( +!CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>, +!CHECK-SAME: %[[ARG1:.*]]: !fir.ref>>) { +!CHECK-NEXT: %[[DST:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs, +!CHECK-SAME: uniq_name = "_copy_box_ptr_i32_dst"} : (!fir.ref>>) -> +!CHECK-SAME: (!fir.ref>>, !fir.ref>>) +!CHECK-NEXT: %[[SRC:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs, +!CHECK-SAME: uniq_name = "_copy_box_ptr_i32_src"} : (!fir.ref>>) -> +!CHECK-SAME: (!fir.ref>>, !fir.ref>>) +!CHECK-NEXT: %[[SRC_VAL:.*]] = fir.load %[[SRC]]#0 : !fir.ref>> +!CHECK-NEXT: fir.store %[[SRC_VAL]] to %[[DST]]#0 : !fir.ref>> +!CHECK-NEXT: return +!CHECK-NEXT: } + +!CHECK-LABEL: func private @_copy_box_heap_Uxi32( +!CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>>, +!CHECK-SAME: %[[ARG1:.*]]: !fir.ref>>>) { +!CHECK-NEXT: %[[DST:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs, +!CHECK-SAME: uniq_name = "_copy_box_heap_Uxi32_dst"} : (!fir.ref>>>) -> +!CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) +!CHECK-NEXT: %[[SRC:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs, +!CHECK-SAME: uniq_name = "_copy_box_heap_Uxi32_src"} : (!fir.ref>>>) -> +!CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) +!CHECK-NEXT: %[[DST_BOX:.*]] = fir.load %[[DST]]#0 : !fir.ref>>> +!CHECK: fir.if %{{.*}} { +!CHECK-NEXT: %[[SRC_BOX:.*]] = fir.load %[[SRC]]#0 : !fir.ref>>> +!CHECK-NEXT: hlfir.assign %[[SRC_BOX]] to %[[DST_BOX]] temporary_lhs : !fir.box>>, +!CHECK-SAME: !fir.box>> +!CHECK-NEXT: } +!CHECK-NEXT: return +!CHECK-NEXT: } + +!CHECK-LABEL: func @_QPtest_alloc_ptr +!CHECK: omp.parallel { +!CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, +!CHECK-SAME: uniq_name = "_QFtest_alloc_ptrEa"} : (!fir.ref>>>) -> +!CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) +!CHECK: %[[P:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, +!CHECK-SAME: uniq_name = "_QFtest_alloc_ptrEp"} : (!fir.ref>>) -> +!CHECK-SAME: (!fir.ref>>, !fir.ref>>) +!CHECK: omp.single copyprivate( +!CHECK-SAME: %[[A]]#0 -> @_copy_box_heap_Uxi32 : !fir.ref>>>, +!CHECK-SAME: %[[P]]#0 -> @_copy_box_ptr_i32 : !fir.ref>>) +!CHECK: } +subroutine test_alloc_ptr() + integer, allocatable :: a(:) + integer, pointer :: p + + !$omp parallel private(a, p) + !$omp single + !$omp end single copyprivate(a, p) + !$omp end parallel +end subroutine diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90 new file mode 100644 index 00000000000000..717d9740c56f80 --- /dev/null +++ b/flang/test/Lower/OpenMP/order-clause.f90 @@ -0,0 +1,76 @@ +! This test checks lowering of OpenMP order clause.
+ +!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPsimd_order() { +subroutine simd_order + !CHECK: omp.simd order(reproducible:concurrent) { + !$omp simd order(concurrent) + do i = 1, 10 + end do + !CHECK: omp.simd order(reproducible:concurrent) { + !$omp simd order(reproducible:concurrent) + do i = 1, 10 + end do + !CHECK: omp.simd order(unconstrained:concurrent) { + !$omp simd order(unconstrained:concurrent) + do i = 1, 10 + end do +end subroutine simd_order + +!CHECK-LABEL: func.func @_QPdo_order() { +subroutine do_order + !CHECK: omp.wsloop order(reproducible:concurrent) { + !$omp do order(concurrent) + do i = 1, 10 + end do + !CHECK: omp.wsloop order(reproducible:concurrent) { + !$omp do order(reproducible:concurrent) + do i = 1, 10 + end do + !CHECK: omp.wsloop order(unconstrained:concurrent) { + !$omp do order(unconstrained:concurrent) + do i = 1, 10 + end do +end subroutine do_order + +!CHECK-LABEL: func.func @_QPdo_simd_order() { +subroutine do_simd_order + !CHECK: omp.wsloop order(reproducible:concurrent) { + !$omp do simd order(concurrent) + do i = 1, 10 + end do + !CHECK: omp.wsloop order(reproducible:concurrent) { + !$omp do simd order(reproducible:concurrent) + do i = 1, 10 + end do + !CHECK: omp.wsloop order(unconstrained:concurrent) { + !$omp do simd order(unconstrained:concurrent) + do i = 1, 10 + end do +end subroutine do_simd_order + +!CHECK-LABEL: func.func @_QPdo_simd_order_parallel() { +subroutine do_simd_order_parallel + !CHECK: omp.parallel { + !CHECK: omp.wsloop order(reproducible:concurrent) { + !$omp parallel do simd order(reproducible:concurrent) + do i = 1, 10 + end do +end subroutine do_simd_order_parallel + + +subroutine distribute_order + !CHECK: omp.distribute order(reproducible:concurrent) { + !$omp teams distribute order(concurrent) + do i=1,10 + end do + !CHECK: omp.distribute order(reproducible:concurrent) { + !$omp teams distribute order(reproducible:concurrent) + do i=1,10 + end do + !CHECK: omp.distribute order(unconstrained:concurrent) { + !$omp teams distribute order(unconstrained:concurrent) + do i = 1, 10 + end do +end subroutine diff --git a/flang/test/Lower/character-local-variables.f90 b/flang/test/Lower/character-local-variables.f90 index 0cf61a2623c4e7..d5b959eca1ff63 100644 --- a/flang/test/Lower/character-local-variables.f90 +++ b/flang/test/Lower/character-local-variables.f90 @@ -1,4 +1,6 @@ ! RUN: bbc -hlfir=false %s -o - | FileCheck %s +! RUN: bbc -hlfir=false --enable-constant-argument-globalisation %s -o - \ +! RUN: | FileCheck %s --check-prefix=CHECK-CONST ! Test lowering of local character variables @@ -118,7 +120,8 @@ subroutine assumed_length_param(n) integer :: n ! CHECK: %[[c4:.*]] = arith.constant 4 : i64 ! CHECK: fir.store %[[c4]] to %[[tmp:.*]] : !fir.ref - ! CHECK: fir.call @_QPtake_int(%[[tmp]]) {{.*}}: (!fir.ref) -> () + ! CHECK-CONST: %[[tmp:.*]] = fir.address_of(@_global_const_.{{.*}}) : !fir.ref + ! CHECK-CONST: fir.call @_QPtake_int(%[[tmp]]) {{.*}}: (!fir.ref) -> () call take_int(len(c(n), kind=8)) end diff --git a/flang/test/Preprocessing/cond-contin.F90 b/flang/test/Preprocessing/cond-contin.F90 new file mode 100644 index 00000000000000..9221e34e013f40 --- /dev/null +++ b/flang/test/Preprocessing/cond-contin.F90 @@ -0,0 +1,21 @@ +! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s +! CHECK: subroutine test(ARG1, FA, FB, ARG2) +! CHECK: end + +subroutine test( & +ARG1, & +! 
test +#ifndef SWAP +#define ARG1 FA +#define ARG2 FB +#else +#define ARG1 FB +#define ARG2 FA +#endif +ARG1, ARG2, & +! test +#undef ARG1 +#undef ARG2 +&ARG2) +! comment +end diff --git a/flang/test/Preprocessing/inc-contin-1.F b/flang/test/Preprocessing/inc-contin-1.F new file mode 100644 index 00000000000000..7a4e3a0cb0b59b --- /dev/null +++ b/flang/test/Preprocessing/inc-contin-1.F @@ -0,0 +1,6 @@ +! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s +! CHECK: call t(1 ,.false.) + program main +#include "inc-contin-1.h" + $,.false.) + end diff --git a/flang/test/Preprocessing/inc-contin-1.h b/flang/test/Preprocessing/inc-contin-1.h new file mode 100644 index 00000000000000..d4b6461e75274f --- /dev/null +++ b/flang/test/Preprocessing/inc-contin-1.h @@ -0,0 +1 @@ + call t(1 diff --git a/flang/test/Preprocessing/inc-contin-2.F90 b/flang/test/Preprocessing/inc-contin-2.F90 new file mode 100644 index 00000000000000..3386ee0dfcddc4 --- /dev/null +++ b/flang/test/Preprocessing/inc-contin-2.F90 @@ -0,0 +1,9 @@ +! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s +! CHECK: print *, 3.14159 +! CHECK: print *, 3. 14159 + program main +#include "inc-contin-2a.h" + &14159 +#include "inc-contin-2b.h" + &14159 + end program main diff --git a/flang/test/Preprocessing/inc-contin-2a.h b/flang/test/Preprocessing/inc-contin-2a.h new file mode 100644 index 00000000000000..24a4fa4830fa3b --- /dev/null +++ b/flang/test/Preprocessing/inc-contin-2a.h @@ -0,0 +1 @@ +print *, 3.& diff --git a/flang/test/Preprocessing/inc-contin-2b.h b/flang/test/Preprocessing/inc-contin-2b.h new file mode 100644 index 00000000000000..b84a464af86e33 --- /dev/null +++ b/flang/test/Preprocessing/inc-contin-2b.h @@ -0,0 +1 @@ +print *, 3. & diff --git a/flang/test/Preprocessing/include-args.F90 b/flang/test/Preprocessing/include-args.F90 index 011e4dba13e730..7c521db666ff9a 100644 --- a/flang/test/Preprocessing/include-args.F90 +++ b/flang/test/Preprocessing/include-args.F90 @@ -1,5 +1,5 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: call foo(3.14159) +! CHECK: call foo(3.14159 ) call foo (& #include "args.h" ) diff --git a/flang/test/Preprocessing/kw-in-char.F90 b/flang/test/Preprocessing/kw-in-char.F90 new file mode 100644 index 00000000000000..e8f3a33b8a7498 --- /dev/null +++ b/flang/test/Preprocessing/kw-in-char.F90 @@ -0,0 +1,14 @@ +! RUN: %flang -E %s 2>&1 | FileCheck %s +! CHECK: subroutine test_b_wrapper_c() bind(C, name="test_b_c_f") +#define TEMP_LETTER b +#define VAL c +subroutine test_& +TEMP_LETTER& +_wrapper_& +VAL& + () bind(C, name="test_& + &TEMP_LETTER& + &_& + &VAL& + &_f") +end diff --git a/flang/test/Preprocessing/sentinel-after-semi.F90 b/flang/test/Preprocessing/sentinel-after-semi.F90 new file mode 100644 index 00000000000000..75060ac1db0328 --- /dev/null +++ b/flang/test/Preprocessing/sentinel-after-semi.F90 @@ -0,0 +1,7 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenacc %s 2>&1 | FileCheck %s +! 
CHECK: !$ACC DECLARE COPYIN(var) +#define ACCDECLARE(var) integer :: var; \ + !$acc declare copyin(var) +program main + ACCDECLARE(var) +end program diff --git a/flang/test/Semantics/boz-literal-constants.f90 b/flang/test/Semantics/boz-literal-constants.f90 index e6392f6f030e52..ee6919cb9ecd5c 100644 --- a/flang/test/Semantics/boz-literal-constants.f90 +++ b/flang/test/Semantics/boz-literal-constants.f90 @@ -7,7 +7,12 @@ subroutine bozchecks integer :: f, realpart = B"0101", img = B"1111", resint logical :: resbit complex :: rescmplx + character :: reschar real :: dbl, e + type :: dt + integer :: n + end type + type(dt) :: resdt interface subroutine explicit(n, x, c) integer :: n @@ -98,6 +103,17 @@ subroutine explicit(n, x, c) res = REAL(B"1101") + resint = z'ff' ! ok + res = z'3f800000' ! ok + !ERROR: Right-hand side of this assignment may not be BOZ + rescmplx = z'123' + !ERROR: Right-hand side of this assignment may not be BOZ + resbit = z'123' + !ERROR: Right-hand side of this assignment may not be BOZ + reschar = z'123' + !ERROR: Right-hand side of this assignment may not be BOZ + resdt = z'123' + !Ok call explicit(z'deadbeef', o'666', 'a') diff --git a/flang/test/Semantics/c_loc01.f90 b/flang/test/Semantics/c_loc01.f90 index 7c9e2941729938..83b88d2ebd4b0f 100644 --- a/flang/test/Semantics/c_loc01.f90 +++ b/flang/test/Semantics/c_loc01.f90 @@ -4,7 +4,10 @@ module m type haslen(L) integer, len :: L end type + integer, target :: targ contains + subroutine subr + end subroutine test(assumedType, poly, nclen) type(*), target :: assumedType class(*), target :: poly @@ -17,6 +20,8 @@ subroutine test(assumedType, poly, nclen) type(hasLen(1)), target :: clen type(hasLen(*)), target :: nclen character(2), target :: ch + real :: arr1(purefun1(c_loc(targ))) ! ok + real :: arr2(purefun2(c_funloc(subr))) ! 
ok !ERROR: C_LOC() argument must be a data pointer or target cp = c_loc(notATarget) !ERROR: C_LOC() argument must be a data pointer or target @@ -44,4 +49,12 @@ subroutine test(assumedType, poly, nclen) !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches operand types TYPE(c_funptr) and TYPE(c_ptr) cfp = cp end + pure integer function purefun1(p) + type(c_ptr), intent(in) :: p + purefun1 = 1 + end + pure integer function purefun2(p) + type(c_funptr), intent(in) :: p + purefun2 = 1 + end end module diff --git a/flang/test/Semantics/call05.f90 b/flang/test/Semantics/call05.f90 index 71f2197067f76c..8a4386e28e2bcc 100644 --- a/flang/test/Semantics/call05.f90 +++ b/flang/test/Semantics/call05.f90 @@ -123,9 +123,7 @@ subroutine test module m2 - !ERROR: Procedure 't3' may not be ALLOCATABLE without an explicit interface character(len=10), allocatable :: t1, t2, t3, t4 - !ERROR: Procedure 't6' may not be ALLOCATABLE without an explicit interface character(len=:), allocatable :: t5, t6, t7, t8(:) character(len=10), pointer :: p1 @@ -189,7 +187,7 @@ subroutine test() !ERROR: ALLOCATABLE dummy argument 'a=' must be associated with an ALLOCATABLE actual argument call sma(t2(:)) - !ERROR: ALLOCATABLE dummy argument 'a=' must be associated with an ALLOCATABLE actual argument + !ERROR: 't3' is not a callable procedure call sma(t3(1)) !ERROR: ALLOCATABLE dummy argument 'a=' must be associated with an ALLOCATABLE actual argument @@ -208,7 +206,7 @@ subroutine test() !ERROR: ALLOCATABLE dummy argument 'a=' must be associated with an ALLOCATABLE actual argument call sma(t5(:)) - !ERROR: ALLOCATABLE dummy argument 'a=' must be associated with an ALLOCATABLE actual argument + !ERROR: 't6' is not a callable procedure call sma(t6(1)) !ERROR: ALLOCATABLE dummy argument 'a=' must be associated with an ALLOCATABLE actual argument diff --git a/flang/test/Semantics/select-rank03.f90 b/flang/test/Semantics/select-rank03.f90 index 8a965e950d3851..77b76f5f584ca2 100644 --- a/flang/test/Semantics/select-rank03.f90 +++ b/flang/test/Semantics/select-rank03.f90 @@ -52,10 +52,12 @@ subroutine allocatables(a) !ERROR: Whole assumed-size array 'a' may not appear here without subscripts a = 1. rank default - !ERROR: An assumed-rank object may not appear in an ALLOCATE statement + !ERROR: An assumed-rank dummy argument may not appear in an ALLOCATE statement allocate(a) deallocate(a) - a = 1. + !ERROR: An assumed-rank dummy argument is not allowed in an assignment statement + !ERROR: An assumed-rank dummy argument is not allowed as an operand here + a = a + 1. end select ! 
Test nested associations select rank(a) @@ -121,11 +123,13 @@ subroutine pointers(p) !ERROR: Whole assumed-size array 'p' may not appear here without subscripts deallocate(p) rank default - !ERROR: An assumed-rank object may not appear in an ALLOCATE statement + !ERROR: An assumed-rank dummy argument may not appear in an ALLOCATE statement allocate(p) deallocate(p) + !ERROR: The left-hand side of a pointer assignment must not be an assumed-rank dummy argument !ERROR: pointer 'p' associated with object 't0' with incompatible type or shape p => t0 + !ERROR: The left-hand side of a pointer assignment must not be an assumed-rank dummy argument !ERROR: pointer 'p' associated with object 't1' with incompatible type or shape p => t1 end select diff --git a/flang/test/Transforms/constant-argument-globalisation-2.fir b/flang/test/Transforms/constant-argument-globalisation-2.fir new file mode 100644 index 00000000000000..03e08a0dcb914b --- /dev/null +++ b/flang/test/Transforms/constant-argument-globalisation-2.fir @@ -0,0 +1,98 @@ +// RUN: fir-opt --split-input-file --constant-argument-globalisation-opt < %s | FileCheck %s + +module { +// Test for "two conditional writes to the same alloca doesn't get replaced." + func.func @func(%arg0: i32, %arg1: i1) { + %c2_i32 = arith.constant 2 : i32 + %addr = fir.alloca i32 {adapt.valuebyref} + fir.if %arg1 { + fir.store %c2_i32 to %addr : !fir.ref + } else { + fir.store %arg0 to %addr : !fir.ref + } + fir.call @sub2(%addr) : (!fir.ref) -> () + return + } + func.func private @sub2(!fir.ref) + +// CHECK-LABEL: func.func @func +// CHECK-SAME: [[ARG0:%.*]]: i32 +// CHECK-SAME: [[ARG1:%.*]]: i1) +// CHECK: [[CONST:%.*]] = arith.constant +// CHECK: [[ADDR:%.*]] = fir.alloca i32 +// CHECK: fir.if [[ARG1]] +// CHECK: fir.store [[CONST]] to [[ADDR]] +// CHECK: } else { +// CHECK: fir.store [[ARG0]] to [[ADDR]] +// CHECK: fir.call @sub2([[ADDR]]) +// CHECK: return + +} + +// ----- + +module { +// Test for "two writes to the same alloca doesn't get replaced." + func.func @func() { + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %addr = fir.alloca i32 {adapt.valuebyref} + fir.store %c1_i32 to %addr : !fir.ref + fir.store %c2_i32 to %addr : !fir.ref + fir.call @sub2(%addr) : (!fir.ref) -> () + return + } + func.func private @sub2(!fir.ref) + +// CHECK-LABEL: func.func @func +// CHECK: [[CONST1:%.*]] = arith.constant +// CHECK: [[CONST2:%.*]] = arith.constant +// CHECK: [[ADDR:%.*]] = fir.alloca i32 +// CHECK: fir.store [[CONST1]] to [[ADDR]] +// CHECK: fir.store [[CONST2]] to [[ADDR]] +// CHECK: fir.call @sub2([[ADDR]]) +// CHECK: return + +} + +// ----- + +module { +// Test for "one write to the the alloca gets replaced." + func.func @func() { + %c1_i32 = arith.constant 1 : i32 + %addr = fir.alloca i32 {adapt.valuebyref} + fir.store %c1_i32 to %addr : !fir.ref + fir.call @sub2(%addr) : (!fir.ref) -> () + return + } + func.func private @sub2(!fir.ref) + +// CHECK-LABEL: func.func @func +// CHECK: [[ADDR:%.*]] = fir.address_of([[EXTR:@.*]]) : !fir.ref +// CHECK: fir.call @sub2([[ADDR]]) +// CHECK: return +// CHECK: fir.global internal [[EXTR]] constant : i32 { +// CHECK: %{{.*}} = arith.constant 1 : i32 +// CHECK: fir.has_value %{{.*}} : i32 +// CHECK: } + +} + +// ----- +// Check that same argument used twice is converted. 
+module { + func.func @func(%arg0: !fir.ref, %arg1: i1) { + %c2_i32 = arith.constant 2 : i32 + %addr1 = fir.alloca i32 {adapt.valuebyref} + fir.store %c2_i32 to %addr1 : !fir.ref + fir.call @sub1(%addr1, %addr1) : (!fir.ref, !fir.ref) -> () + return + } +} + +// CHECK-LABEL: func.func @func +// CHECK-NEXT: %[[ARG1:.*]] = fir.address_of([[CONST1:@.*]]) : !fir.ref +// CHECK-NEXT: %[[ARG2:.*]] = fir.address_of([[CONST2:@.*]]) : !fir.ref +// CHECK-NEXT: fir.call @sub1(%[[ARG1]], %[[ARG2]]) +// CHECK-NEXT: return diff --git a/flang/test/Transforms/constant-argument-globalisation.fir b/flang/test/Transforms/constant-argument-globalisation.fir new file mode 100644 index 00000000000000..553c3c6c248458 --- /dev/null +++ b/flang/test/Transforms/constant-argument-globalisation.fir @@ -0,0 +1,67 @@ +// RUN: fir-opt --constant-argument-globalisation-opt < %s | FileCheck %s +// RUN: %flang_fc1 -emit-llvm -flang-deprecated-no-hlfir -O2 -o - %s | FileCheck --check-prefix=DISABLE %s +module { + func.func @sub1(%arg0: !fir.ref {fir.bindc_name = "x"}, %arg1: !fir.ref {fir.bindc_name = "y"}) { + %0 = fir.alloca i32 {adapt.valuebyref} + %1 = fir.alloca f64 {adapt.valuebyref} + %2 = fir.alloca f64 {adapt.valuebyref} + %c1_i32 = arith.constant 1 : i32 + %cst = arith.constant 1.000000e+00 : f64 + %cst_0 = arith.constant 0.000000e+00 : f64 + %3 = fir.declare %arg0 {uniq_name = "_QFsub1Ex"} : (!fir.ref) -> !fir.ref + %4 = fir.declare %arg1 {uniq_name = "_QFsub1Ey"} : (!fir.ref) -> !fir.ref + fir.store %cst_0 to %2 : !fir.ref + %false = arith.constant false + fir.store %cst to %1 : !fir.ref + %false_1 = arith.constant false + fir.store %c1_i32 to %0 : !fir.ref + %false_2 = arith.constant false + fir.call @sub2(%2, %1, %3, %4, %0) fastmath : (!fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref) -> () + return + } + func.func private @sub2(!fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref) + +// CHECK-LABEL: func.func @sub1( +// CHECK-SAME: [[ARG0:%.*]]: !fir.ref {{{.*}}}, +// CHECK-SAME: [[ARG1:%.*]]: !fir.ref {{{.*}}}) { +// CHECK: [[X:%.*]] = fir.declare [[ARG0]] {{.*}} +// CHECK: [[Y:%.*]] = fir.declare [[ARG1]] {{.*}} +// CHECK: [[CONST_R0:%.*]] = fir.address_of([[EXTR_0:@.*]]) : !fir.ref +// CHECK: [[CONST_R1:%.*]] = fir.address_of([[EXTR_1:@.*]]) : !fir.ref +// CHECK: [[CONST_I:%.*]] = fir.address_of([[EXTR_2:@.*]]) : !fir.ref +// CHECK: fir.call @sub2([[CONST_R0]], [[CONST_R1]], [[X]], [[Y]], [[CONST_I]]) +// CHECK-SAME: fastmath +// CHECK: return + +// CHECK: fir.global internal [[EXTR_0]] constant : f64 { +// CHECK: %{{.*}} = arith.constant 0.000000e+00 : f64 +// CHECK: fir.has_value %{{.*}} : f64 +// CHECK: } +// CHECK: fir.global internal [[EXTR_1]] constant : f64 { +// CHECK: %{{.*}} = arith.constant 1.000000e+00 : f64 +// CHECK: fir.has_value %{{.*}} : f64 +// CHECK: } +// CHECK: fir.global internal [[EXTR_2]] constant : i32 { +// CHECK: %{{.*}} = arith.constant 1 : i32 +// CHECK: fir.has_value %{{.*}} : i32 +// CHECK: } + +// DISABLE-LABEL: ; ModuleID = +// DISABLE-NOT: @_extruded +// DISABLE: define void @sub1( +// DISABLE-SAME: ptr [[ARG0:%.*]], +// DISABLE-SAME: ptr [[ARG1:%.*]]) +// DISABLE-SAME: { +// DISABLE: [[CONST_R0:%.*]] = alloca double +// DISABLE: [[CONST_R1:%.*]] = alloca double +// DISABLE: [[CONST_I:%.*]] = alloca i32 +// DISABLE: store double 0.0{{.*}}+00, ptr [[CONST_R0]] +// DISABLE: store double 1.0{{.*}}+00, ptr [[CONST_R1]] +// DISABLE: store i32 1, ptr [[CONST_I]] +// DISABLE: call void @sub2(ptr nonnull [[CONST_R0]], +// DISABLE-SAME: ptr nonnull [[CONST_R1]], +// DISABLE-SAME: ptr 
[[ARG0]], ptr [[ARG1]], +// DISABLE-SAME: ptr nonnull [[CONST_I]]) +// DISABLE: ret void +// DISABLE: } +} diff --git a/flang/test/Transforms/debug-char-type-1.fir b/flang/test/Transforms/debug-char-type-1.fir index cdce3b7b8b3340..630b52d96cb858 100644 --- a/flang/test/Transforms/debug-char-type-1.fir +++ b/flang/test/Transforms/debug-char-type-1.fir @@ -9,6 +9,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { %0 = fir.zero_bits !fir.char<4,20> fir.has_value %0 : !fir.char<4,20> } loc(#loc1) + fir.global @_QMhelperEstr3 : !fir.box>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap> + %1 = fir.embox %0 typeparams %c0 : (!fir.heap>, index) -> !fir.box>> + fir.has_value %1 : !fir.box>> + } loc(#loc1) } #loc1 = loc("string.f90":1:1) @@ -16,4 +22,5 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK-DAG: #llvm.di_global_variable<{{.*}}name = "str1"{{.*}}type = #[[TY1]]{{.*}}> // CHECK-DAG: #[[TY2:.*]] = #llvm.di_string_type // CHECK-DAG: #llvm.di_global_variable<{{.*}}name = "str2"{{.*}}type = #[[TY2]]{{.*}}> - +// CHECK-DAG: #[[TY3:.*]] = #llvm.di_string_type, stringLocationExp = <[DW_OP_push_object_address, DW_OP_deref]>, encoding = DW_ATE_ASCII> +// CHECK-DAG: #llvm.di_global_variable<{{.*}}name = "str3"{{.*}}type = #[[TY3]]{{.*}}> diff --git a/flang/test/Transforms/debug-ptr-type.fir b/flang/test/Transforms/debug-ptr-type.fir new file mode 100644 index 00000000000000..3f6c895ddaf8eb --- /dev/null +++ b/flang/test/Transforms/debug-ptr-type.fir @@ -0,0 +1,40 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { + fir.global @_QMhelperEpar : !fir.box>> { + %0 = fir.zero_bits !fir.ptr> + %c0 = arith.constant 0 : index + %1 = fir.shape %c0, %c0 : (index, index) -> !fir.shape<2> + %2 = fir.embox %0(%1) : (!fir.ptr>, !fir.shape<2>) -> !fir.box>> + fir.has_value %2 : !fir.box>> + } loc(#loc1) + fir.global @_QMhelperEpar2 : !fir.box>> { + %0 = fir.zero_bits !fir.ptr> + %c0 = arith.constant 0 : index + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> + } loc(#loc2) + fir.global @_QMhelperEpsc : !fir.box> { + %0 = fir.zero_bits !fir.ptr + %1 = fir.embox %0 : (!fir.ptr) -> !fir.box> + fir.has_value %1 : !fir.box> + } loc(#loc3) + fir.global @_QMmEpstr : !fir.box>> { + %0 = fir.zero_bits !fir.ptr> + %1 = fir.embox %0 : (!fir.ptr>) -> !fir.box>> + fir.has_value %1 : !fir.box>> + } loc(#loc4) +} +#loc1 = loc("test.f90":5:1) +#loc2 = loc("test.f90":6:1) +#loc3 = loc("test.f90":7:1) +#loc4 = loc("test.f90":8:1) + +// CHECK-DAG: #[[INT_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[ARR1_TY:.*]] = #llvm.di_composite_type, #llvm.di_subrange, dataLocation = {{.*}}, associated = <[DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne]>> +// CHECK-DAG: #[[ARR2_TY:.*]] = #llvm.di_composite_type, dataLocation = {{.*}}, associated = <[DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne]>> +// CHECK-DAG: #[[PTR_TY:.*]] = #llvm.di_derived_type +// CHECK-DAG: #llvm.di_global_variable<{{.*}}name = "par"{{.*}}type = #[[ARR1_TY]]{{.*}}> +// CHECK-DAG: #llvm.di_global_variable<{{.*}}name = "par2"{{.*}}type = #[[ARR2_TY]]{{.*}}> +// CHECK-DAG: #llvm.di_global_variable<{{.*}}name = "psc"{{.*}}type = #[[PTR_TY]]{{.*}}> diff --git a/flang/unittests/Evaluate/intrinsics.cpp b/flang/unittests/Evaluate/intrinsics.cpp index 0bbc7fede699aa..86c471d2c62950 100644 --- 
a/flang/unittests/Evaluate/intrinsics.cpp +++ b/flang/unittests/Evaluate/intrinsics.cpp @@ -344,6 +344,7 @@ void TestIntrinsics() { TEST(table.GetGenericIntrinsicName("dcos") == "cos"); TEST(table.GetGenericIntrinsicName("dcosh") == "cosh"); TEST(table.GetGenericIntrinsicName("ddim") == "dim"); + TEST(table.GetGenericIntrinsicName("derf") == "erf"); TEST(table.GetGenericIntrinsicName("dexp") == "exp"); TEST(table.GetGenericIntrinsicName("dint") == "aint"); TEST(table.GetGenericIntrinsicName("dlog") == "log"); diff --git a/flang/unittests/Frontend/FrontendActionTest.cpp b/flang/unittests/Frontend/FrontendActionTest.cpp index 123f428cc8b40b..bdf5a23fdbf6a7 100644 --- a/flang/unittests/Frontend/FrontendActionTest.cpp +++ b/flang/unittests/Frontend/FrontendActionTest.cpp @@ -143,7 +143,7 @@ TEST_F(FrontendActionTest, PrintPreprocessedInput) { EXPECT_TRUE(success); EXPECT_TRUE(!outputFileBuffer.empty()); EXPECT_TRUE( - llvm::StringRef(outputFileBuffer.data()).starts_with("program b\n")); + llvm::StringRef(outputFileBuffer.data()).starts_with(" program b\n")); } TEST_F(FrontendActionTest, ParseSyntaxOnly) { diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp index 08daa4ba37f26b..20bd7a5b5ff35a 100644 --- a/flang/unittests/Runtime/CommandTest.cpp +++ b/flang/unittests/Runtime/CommandTest.cpp @@ -311,8 +311,8 @@ TEST_F(ZeroArguments, GetCommand) { CheckCommandValue(commandOnlyArgv, 1); } TEST_F(ZeroArguments, ECLValidCommandAndPadSync) { OwningPtr command{CharDescriptor("echo hi")}; bool wait{true}; - OwningPtr exitStat{EmptyIntDescriptor()}; - OwningPtr cmdStat{EmptyIntDescriptor()}; + OwningPtr exitStat{IntDescriptor(404)}; + OwningPtr cmdStat{IntDescriptor(202)}; OwningPtr cmdMsg{CharDescriptor("No change")}; RTNAME(ExecuteCommandLine) @@ -339,40 +339,91 @@ TEST_F(ZeroArguments, ECLValidCommandStatusSetSync) { CheckDescriptorEqStr(cmdMsg.get(), "No change"); } -TEST_F(ZeroArguments, ECLInvalidCommandErrorSync) { - OwningPtr command{CharDescriptor("InvalidCommand")}; +TEST_F(ZeroArguments, ECLGeneralErrorCommandErrorSync) { + OwningPtr command{CharDescriptor("cat GeneralErrorCommand")}; + bool wait{true}; + OwningPtr exitStat{IntDescriptor(404)}; + OwningPtr cmdStat{IntDescriptor(202)}; + OwningPtr cmdMsg{CharDescriptor("cmd msg buffer XXXXXXXXXXXXXX")}; + + RTNAME(ExecuteCommandLine) + (*command.get(), wait, exitStat.get(), cmdStat.get(), cmdMsg.get()); +#ifdef _WIN32 + CheckDescriptorEqInt(exitStat.get(), 1); + CheckDescriptorEqInt(cmdStat.get(), 6); + CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXXXX"); +#else + CheckDescriptorEqInt(exitStat.get(), 1); + CheckDescriptorEqInt(cmdStat.get(), 3); + CheckDescriptorEqStr(cmdMsg.get(), "Command line execution failed"); +#endif +} + +TEST_F(ZeroArguments, ECLNotExecutedCommandErrorSync) { + OwningPtr command{CharDescriptor( + "touch NotExecutedCommandFile && chmod -x NotExecutedCommandFile && " + "./NotExecutedCommandFile")}; + bool wait{true}; + OwningPtr exitStat{IntDescriptor(404)}; + OwningPtr cmdStat{IntDescriptor(202)}; + OwningPtr cmdMsg{CharDescriptor("cmd msg buffer XXXXXXXX")}; + + RTNAME(ExecuteCommandLine) + (*command.get(), wait, exitStat.get(), cmdStat.get(), cmdMsg.get()); +#ifdef _WIN32 + CheckDescriptorEqInt(exitStat.get(), 1); + CheckDescriptorEqInt(cmdStat.get(), 6); + CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXX"); +#else + CheckDescriptorEqInt(exitStat.get(), 126); + CheckDescriptorEqInt(cmdStat.get(), 4); + CheckDescriptorEqStr(cmdMsg.get(), "Command 
cannot be execu"); + // removing the file only on Linux (file is not created on Win) + OwningPtr commandClean{ + CharDescriptor("rm -f NotExecutedCommandFile")}; + OwningPtr cmdMsgNoErr{CharDescriptor("No Error")}; + RTNAME(ExecuteCommandLine) + (*commandClean.get(), wait, exitStat.get(), cmdStat.get(), cmdMsgNoErr.get()); + CheckDescriptorEqInt(exitStat.get(), 0); + CheckDescriptorEqStr(cmdMsgNoErr.get(), "No Error"); + CheckDescriptorEqInt(cmdStat.get(), 0); +#endif +} + +TEST_F(ZeroArguments, ECLNotFoundCommandErrorSync) { + OwningPtr command{CharDescriptor("NotFoundCommand")}; bool wait{true}; OwningPtr exitStat{IntDescriptor(404)}; OwningPtr cmdStat{IntDescriptor(202)}; - OwningPtr cmdMsg{CharDescriptor("Message ChangedXXXXXXXXX")}; + OwningPtr cmdMsg{CharDescriptor("unmodified buffer XXXXXXXXX")}; RTNAME(ExecuteCommandLine) (*command.get(), wait, exitStat.get(), cmdStat.get(), cmdMsg.get()); #ifdef _WIN32 - CheckDescriptorEqInt(exitStat.get(), 1); + CheckDescriptorEqInt(exitStat.get(), 1); + CheckDescriptorEqInt(cmdStat.get(), 6); + CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXX"); #else CheckDescriptorEqInt(exitStat.get(), 127); + CheckDescriptorEqInt(cmdStat.get(), 5); + CheckDescriptorEqStr(cmdMsg.get(), "Command not found with exit"); #endif - CheckDescriptorEqInt(cmdStat.get(), 3); - CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXX"); } TEST_F(ZeroArguments, ECLInvalidCommandTerminatedSync) { OwningPtr command{CharDescriptor("InvalidCommand")}; bool wait{true}; - OwningPtr exitStat{IntDescriptor(404)}; OwningPtr cmdMsg{CharDescriptor("No Change")}; #ifdef _WIN32 EXPECT_DEATH(RTNAME(ExecuteCommandLine)( - *command.get(), wait, exitStat.get(), nullptr, cmdMsg.get()), + *command.get(), wait, nullptr, nullptr, cmdMsg.get()), "Invalid command quit with exit status code: 1"); #else EXPECT_DEATH(RTNAME(ExecuteCommandLine)( - *command.get(), wait, exitStat.get(), nullptr, cmdMsg.get()), - "Invalid command quit with exit status code: 127"); + *command.get(), wait, nullptr, nullptr, cmdMsg.get()), + "Command not found with exit code: 127."); #endif - CheckDescriptorEqInt(exitStat.get(), 404); CheckDescriptorEqStr(cmdMsg.get(), "No Change"); } @@ -394,13 +445,10 @@ TEST_F(ZeroArguments, ECLValidCommandAndExitStatNoChangeAndCMDStatusSetAsync) { TEST_F(ZeroArguments, ECLInvalidCommandParentNotTerminatedAsync) { OwningPtr command{CharDescriptor("InvalidCommand")}; bool wait{false}; - OwningPtr exitStat{IntDescriptor(404)}; OwningPtr cmdMsg{CharDescriptor("No change")}; EXPECT_NO_FATAL_FAILURE(RTNAME(ExecuteCommandLine)( - *command.get(), wait, exitStat.get(), nullptr, cmdMsg.get())); - - CheckDescriptorEqInt(exitStat.get(), 404); + *command.get(), wait, nullptr, nullptr, cmdMsg.get())); CheckDescriptorEqStr(cmdMsg.get(), "No change"); } diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index f35471a06a53e5..4ffcd55ba9500f 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -401,9 +401,7 @@ if(LLVM_INCLUDE_TESTS) add_subdirectory(fuzzing) endif() -if(LIBC_INCLUDE_BENCHMARKS) - add_subdirectory(benchmarks) -endif() +add_subdirectory(benchmarks) if (LIBC_INCLUDE_DOCS) add_subdirectory(docs) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 4978da65850ccb..0cff6eb12c2475 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -1,3 +1,13 @@ +if(LIBC_TARGET_OS_IS_GPU) + add_subdirectory(gpu) + return() +endif() + +# The CPU build depends on Google benchmark. 
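+# A minimal configure sketch (an assumed invocation) that opts the CPU
+# benchmarks in:
+#   cmake ... -DLIBC_INCLUDE_BENCHMARKS=ON
+# Without that cache variable this file returns below, before any of the
+# Google-benchmark-based CPU targets are configured.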
+if(NOT LIBC_INCLUDE_BENCHMARKS) + return() +endif() + find_package(Threads) set(LLVM_LINK_COMPONENTS diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp new file mode 100644 index 00000000000000..2e7e8e7600fdb7 --- /dev/null +++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp @@ -0,0 +1,97 @@ +#include "benchmarks/gpu/BenchmarkLogger.h" +#include "src/__support/CPP/string.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/io.h" // write_to_stderr +#include "src/__support/big_int.h" // is_big_int +#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 +#include "src/__support/uint128.h" + +#include + +namespace LIBC_NAMESPACE { +namespace benchmarks { + +// cpp::string_view specialization +template <> +BenchmarkLogger & + BenchmarkLogger::operator<< (cpp::string_view str) { + LIBC_NAMESPACE::write_to_stderr(str); + return *this; +} + +// cpp::string specialization +template <> +BenchmarkLogger &BenchmarkLogger::operator<< (cpp::string str) { + return *this << static_cast(str); +} + +// const char* specialization +template <> +BenchmarkLogger &BenchmarkLogger::operator<< (const char *str) { + return *this << cpp::string_view(str); +} + +// char* specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<< (char *str) { + return *this << cpp::string_view(str); +} + +// char specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) { + return *this << cpp::string_view(&ch, 1); +} + +// bool specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) { + return *this << (cond ? "true" : "false"); +} + +// void * specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) { + return *this << "0x" << cpp::to_string(reinterpret_cast(addr)); +} + +template BenchmarkLogger &BenchmarkLogger::operator<<(T t) { + if constexpr (is_big_int_v || + (cpp::is_integral_v && cpp::is_unsigned_v && + (sizeof(T) > sizeof(uint64_t)))) { + static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt"); + const IntegerToString buffer(t); + return *this << buffer.view(); + } else { + return *this << cpp::to_string(t); + } +} + +// is_integral specializations +// char is already specialized to handle character +template BenchmarkLogger &BenchmarkLogger::operator<< (short); +template BenchmarkLogger &BenchmarkLogger::operator<< (int); +template BenchmarkLogger &BenchmarkLogger::operator<< (long); +template BenchmarkLogger &BenchmarkLogger::operator<< (long long); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned char); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned short); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned int); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned long); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned long long); + +#ifdef LIBC_TYPES_HAS_INT128 +template BenchmarkLogger & + BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); +#endif // LIBC_TYPES_HAS_INT128 +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<128>); +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<192>); +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<256>); +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<320>); + +// TODO: Add floating point formatting once it's supported by StringStream. 
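+// A minimal usage sketch (assuming a BenchmarkResult named `result` is in
+// scope; `log` is the global logger defined just below):
+//   log << "cycles: " << result.cycles << ", max: " << result.max << '\n';
+// Plain integral values go through cpp::to_string above, while unsigned
+// integers wider than uint64_t take the IntegerToString path instead.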
+ +BenchmarkLogger log; + +} // namespace benchmarks +} // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h new file mode 100644 index 00000000000000..332ff1439e6f57 --- /dev/null +++ b/libc/benchmarks/gpu/BenchmarkLogger.h @@ -0,0 +1,27 @@ +//===-- Utilities to log to standard output during tests --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H +#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H + +namespace LIBC_NAMESPACE { +namespace benchmarks { + +// A class to log to standard output in the context of hermetic tests. +struct BenchmarkLogger { + constexpr BenchmarkLogger() = default; + template BenchmarkLogger &operator<<(T); +}; + +// A global TestLogger instance to be used in tests. +extern BenchmarkLogger log; + +} // namespace benchmarks +} // namespace LIBC_NAMESPACE + +#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */ diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt new file mode 100644 index 00000000000000..d167abcaf2db1c --- /dev/null +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -0,0 +1,56 @@ +add_subdirectory(timing) + +add_custom_target(gpu-benchmark) + +function(add_benchmark benchmark_name) + cmake_parse_arguments( + "BENCHMARK" + "" # Optional arguments + "" # Single value arguments + "LINK_LIBRARIES" # Multi-value arguments + ${ARGN} + ) + if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) + message(FATAL_ERROR "target does not support clock") + endif() + add_libc_hermetic( + ${benchmark_name} + IS_BENCHMARK + LINK_LIBRARIES + LibcGpuBenchmark.hermetic + ${BENCHMARK_LINK_LIBRARIES} + ${BENCHMARK_UNPARSED_ARGUMENTS} + ) + get_fq_target_name(${benchmark_name} fq_target_name) + add_dependencies(gpu-benchmark ${fq_target_name}) +endfunction(add_benchmark) + +add_unittest_framework_library( + LibcGpuBenchmark + SRCS + LibcGpuBenchmark.cpp + LibcGpuBenchmarkMain.cpp + BenchmarkLogger.cpp + HDRS + LibcGpuBenchmark.h + BenchmarkLogger.h + DEPENDS + libc.src.__support.big_int + libc.src.__support.c_string + libc.src.__support.CPP.string + libc.src.__support.CPP.string_view + libc.src.__support.CPP.type_traits + libc.src.__support.CPP.functional + libc.src.__support.CPP.limits + libc.src.__support.CPP.algorithm + libc.src.__support.fixed_point.fx_rep + libc.src.__support.macros.properties.types + libc.src.__support.OSUtil.osutil + libc.src.__support.uint128 + libc.src.__support.FPUtil.sqrt + libc.src.__support.fixedvector + libc.src.time.clock + libc.benchmarks.gpu.timing.timing +) + +add_subdirectory(src) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp new file mode 100644 index 00000000000000..69adb0c95ba768 --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -0,0 +1,140 @@ +#include "LibcGpuBenchmark.h" +#include "src/__support/CPP/algorithm.h" +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/string.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/GPU/utils.h" +#include "src/__support/fixedvector.h" +#include "src/time/gpu/time_utils.h" + +namespace LIBC_NAMESPACE { +namespace benchmarks { + +FixedVector benchmarks; +cpp::array results; + +void 
Benchmark::add_benchmark(Benchmark *benchmark) { + benchmarks.push_back(benchmark); +} + +BenchmarkResult reduce_results(cpp::array &results) { + BenchmarkResult result; + uint64_t cycles_sum = 0; + double standard_deviation_sum = 0; + uint64_t min = UINT64_MAX; + uint64_t max = 0; + uint32_t samples_sum = 0; + uint32_t iterations_sum = 0; + clock_t time_sum = 0; + uint64_t num_threads = gpu::get_num_threads(); + for (uint64_t i = 0; i < num_threads; i++) { + BenchmarkResult current_result = results[i]; + cycles_sum += current_result.cycles; + standard_deviation_sum += current_result.standard_deviation; + min = cpp::min(min, current_result.min); + max = cpp::max(max, current_result.max); + samples_sum += current_result.samples; + iterations_sum += current_result.total_iterations; + time_sum += current_result.total_time; + } + result.cycles = cycles_sum / num_threads; + result.standard_deviation = standard_deviation_sum / num_threads; + result.min = min; + result.max = max; + result.samples = samples_sum / num_threads; + result.total_iterations = iterations_sum / num_threads; + result.total_time = time_sum / num_threads; + return result; +} + +void Benchmark::run_benchmarks() { + uint64_t id = gpu::get_thread_id(); + gpu::sync_threads(); + + for (Benchmark *benchmark : benchmarks) + results[id] = benchmark->run(); + gpu::sync_threads(); + if (id == 0) { + for (Benchmark *benchmark : benchmarks) { + BenchmarkResult all_results = reduce_results(results); + constexpr auto GREEN = "\033[32m"; + constexpr auto RESET = "\033[0m"; + log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n'; + log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": " + << all_results.cycles << " cycles, " << all_results.min << " min, " + << all_results.max << " max, " << all_results.total_iterations + << " iterations, " << all_results.total_time << " ns, " + << static_cast(all_results.standard_deviation) << " stddev\n"; + } + } + gpu::sync_threads(); +} + +BenchmarkResult benchmark(const BenchmarkOptions &options, + cpp::function wrapper_func) { + BenchmarkResult result; + RuntimeEstimationProgression rep; + uint32_t total_iterations = 0; + uint32_t iterations = options.initial_iterations; + if (iterations < 1u) + iterations = 1; + + uint32_t samples = 0; + uint64_t total_time = 0; + uint64_t best_guess = 0; + uint64_t total_cycles = 0; + uint64_t cycles_squared = 0; + uint64_t min = UINT64_MAX; + uint64_t max = 0; + + uint64_t overhead = UINT64_MAX; + int overhead_iterations = 10; + for (int i = 0; i < overhead_iterations; i++) + overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead()); + + for (uint64_t time_budget = options.max_duration; time_budget >= 0;) { + uint64_t sample_cycles = 0; + const clock_t start = static_cast(clock()); + for (uint32_t i = 0; i < iterations; i++) { + auto wrapper_intermediate = wrapper_func(); + uint64_t result = wrapper_intermediate - overhead; + max = cpp::max(max, result); + min = cpp::min(min, result); + sample_cycles += result; + } + const clock_t end = clock(); + const clock_t duration_ns = + ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; + total_time += duration_ns; + time_budget -= duration_ns; + samples++; + total_cycles += sample_cycles; + cycles_squared += sample_cycles * sample_cycles; + + total_iterations += iterations; + const double change_ratio = + rep.compute_improvement({iterations, sample_cycles}); + best_guess = rep.current_estimation; + + if (samples >= options.max_samples || iterations >= options.max_iterations) + break; + if 
(total_time >= options.min_duration && samples >= options.min_samples && + change_ratio < options.epsilon) + break; + + iterations *= options.scaling_factor; + } + result.cycles = best_guess; + result.standard_deviation = fputil::sqrt( + static_cast(cycles_squared) / total_iterations - + static_cast(best_guess * best_guess)); + result.min = min; + result.max = max; + result.samples = samples; + result.total_iterations = total_iterations; + result.total_time = total_time; + return result; +}; + +} // namespace benchmarks +} // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h new file mode 100644 index 00000000000000..59dd5894620808 --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -0,0 +1,108 @@ +#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H +#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H + +#include "benchmarks/gpu/BenchmarkLogger.h" +#include "benchmarks/gpu/timing/timing.h" +#include "src/__support/CPP/functional.h" +#include "src/__support/CPP/limits.h" +#include "src/__support/CPP/string_view.h" +#include "src/time/clock.h" + +#include + +namespace LIBC_NAMESPACE { + +namespace benchmarks { + +struct BenchmarkOptions { + uint32_t initial_iterations = 1; + uint32_t max_iterations = 10000000; + uint32_t min_samples = 4; + uint32_t max_samples = 1000; + uint64_t min_duration = 0; // in nanoseconds (ns) + uint64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second + double epsilon = 0.01; + double scaling_factor = 1.4; +}; + +struct Measurement { + uint32_t iterations = 0; + uint64_t elapsed_cycles = 0; +}; + +class RefinableRuntimeEstimation { + uint64_t total_cycles = 0; + uint32_t total_iterations = 0; + +public: + uint64_t update(const Measurement &M) { + total_cycles += M.elapsed_cycles; + total_iterations += M.iterations; + return total_cycles / total_iterations; + } +}; + +// Tracks the progression of the runtime estimation +class RuntimeEstimationProgression { + RefinableRuntimeEstimation rre; + +public: + uint64_t current_estimation = 0; + + double compute_improvement(const Measurement &M) { + const uint64_t new_estimation = rre.update(M); + double ratio = + (static_cast(current_estimation) / new_estimation) - 1.0; + + // Get absolute value + if (ratio < 0) + ratio *= -1; + + current_estimation = new_estimation; + return ratio; + } +}; + +struct BenchmarkResult { + uint64_t cycles = 0; + double standard_deviation = 0; + uint64_t min = UINT64_MAX; + uint64_t max = 0; + uint32_t samples = 0; + uint32_t total_iterations = 0; + clock_t total_time = 0; +}; + +BenchmarkResult benchmark(const BenchmarkOptions &options, + cpp::function wrapper_func); + +class Benchmark { + const cpp::function func; + const cpp::string_view name; + +public: + Benchmark(cpp::function func, char const *name) + : func(func), name(name) { + add_benchmark(this); + } + + static void run_benchmarks(); + +protected: + static void add_benchmark(Benchmark *benchmark); + +private: + BenchmarkResult run() { + BenchmarkOptions options; + return benchmark(options, func); + } + const cpp::string_view get_name() const { return name; } +}; +} // namespace benchmarks +} // namespace LIBC_NAMESPACE + +#define BENCHMARK(SuiteName, TestName, Func) \ + LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ + Func, #SuiteName "." 
#TestName); + +#endif diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp new file mode 100644 index 00000000000000..97366e55194a90 --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp @@ -0,0 +1,6 @@ +#include "LibcGpuBenchmark.h" + +extern "C" int main(int argc, char **argv, char **envp) { + LIBC_NAMESPACE::benchmarks::Benchmark::run_benchmarks(); + return 0; +} diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt new file mode 100644 index 00000000000000..42eb4f7b5909a8 --- /dev/null +++ b/libc/benchmarks/gpu/src/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(ctype) diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt new file mode 100644 index 00000000000000..79f01425770da9 --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt @@ -0,0 +1,21 @@ +add_custom_target(libc-gpu-ctype-benchmarks) + +add_benchmark( + isalnum_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalnum_benchmark.cpp + DEPENDS + libc.src.ctype.isalnum +) + +add_benchmark( + isalpha_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalpha_benchmark.cpp + DEPENDS + libc.src.ctype.isalpha +) diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp new file mode 100644 index 00000000000000..4050bc0ec77b9f --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp @@ -0,0 +1,9 @@ +#include "benchmarks/gpu/LibcGpuBenchmark.h" + +#include "src/ctype/isalnum.h" + +uint64_t BM_IsAlnum() { + char x = 'c'; + return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x); +} +BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum); diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp new file mode 100644 index 00000000000000..2038eb89bc77ba --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp @@ -0,0 +1,9 @@ +#include "benchmarks/gpu/LibcGpuBenchmark.h" + +#include "src/ctype/isalpha.h" + +uint64_t BM_IsAlpha() { + char x = 'c'; + return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x); +} +BENCHMARK(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha); diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt new file mode 100644 index 00000000000000..8bbc7e33f122a4 --- /dev/null +++ b/libc/benchmarks/gpu/timing/CMakeLists.txt @@ -0,0 +1,12 @@ +foreach(target nvptx) + add_subdirectory(${target}) + list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing) +endforeach() + +add_header_library( + timing + HDRS + timing.h + DEPENDS + ${target_gpu_timing} +) diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt new file mode 100644 index 00000000000000..9958e16206a410 --- /dev/null +++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt @@ -0,0 +1,7 @@ +add_header_library( + nvptx_timing + HDRS + timing.h + DEPENDS + libc.src.__support.common +) diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h new file mode 100644 index 00000000000000..d3851a764c43db --- /dev/null +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -0,0 +1,99 @@ +//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static uint64_t overhead() {
+  volatile uint32_t x = 1;
+  uint32_t y = x;
+  uint64_t start = gpu::processor_clock();
+  asm("" ::"r"(y), "llr"(start));
+  uint32_t result = y;
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  uint64_t stop = gpu::processor_clock();
+  volatile auto storage = result;
+  return stop - start;
+}
+
+// Time a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // We need to store the input somewhere to guarantee that the compiler will
+  // not constant propagate it and remove the profiling region.
+  volatile T storage = t;
+  T arg = storage;
+  asm("" ::"r"(arg));
+
+  // Get the current timestamp from the clock.
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  // This forces the compiler to load the input argument and run the clock
+  // cycle counter before the profiling region.
+  asm("" ::"r"(arg), "llr"(start));
+
+  // Run the function under test and return its value.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to be used
+  // and prevents us from exiting this region before it's complete.
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering.
+  uint64_t stop = gpu::processor_clock();
+  gpu::memory_fence();
+  asm("" ::"r"(stop));
+  volatile T output = result;
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
+template <typename F, typename T1, typename T2>
+static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
+  volatile T1 storage = t1;
+  volatile T2 storage2 = t2;
+  T1 arg = storage;
+  T2 arg2 = storage2;
+  asm("" ::"r"(arg), "r"(arg2));
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"r"(arg), "r"(arg2), "llr"(start));
+
+  auto result = f(arg, arg2);
+
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  uint64_t stop = gpu::processor_clock();
+  gpu::memory_fence();
+  asm("" ::"r"(stop));
+  volatile auto output = result;
+
+  return stop - start;
+}
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
new file mode 100644
index 00000000000000..180ea77954ae59
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -0,0 +1,22 @@
+//===------------- Implementation of GPU timing utils -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H +#define LLVM_LIBC_UTILS_GPU_TIMING_H + +#include "src/__support/macros/properties/architectures.h" + +#if defined(LIBC_TARGET_ARCH_IS_AMDGPU) +#error "amdgpu not yet supported" +#elif defined(LIBC_TARGET_ARCH_IS_NVPTX) +#include "nvptx/timing.h" +#else +#error "unsupported platform" +#endif + +#endif // LLVM_LIBC_UTILS_GPU_TIMING_H diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index c8d7c8a2b1c7ca..fbeec32883b635 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -526,12 +526,15 @@ function(add_integration_test test_name) add_dependencies(${INTEGRATION_TEST_SUITE} ${fq_target_name}) endfunction(add_integration_test) -# Rule to add a hermetic test. A hermetic test is one whose executable is fully +# Rule to add a hermetic program. A hermetic program is one whose executable is fully # statically linked and consists of pieces drawn only from LLVM's libc. Nothing, # including the startup objects, come from the system libc. # +# For the GPU, these can be either tests or benchmarks, depending on the value +# of the LINK_LIBRARIES arg. +# # Usage: -# add_libc_hermetic_test( +# add_libc_hermetic( # # SUITE # SRCS [src2.cpp ...] @@ -543,14 +546,14 @@ endfunction(add_integration_test) # LINK_LIBRARIES # LOADER_ARGS # ) -function(add_libc_hermetic_test test_name) +function(add_libc_hermetic test_name) if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1) message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.") return() endif() cmake_parse_arguments( "HERMETIC_TEST" - "" # No optional arguments + "IS_BENCHMARK" # Optional arguments "SUITE" # Single value arguments "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments ${ARGN} @@ -678,7 +681,6 @@ function(add_libc_hermetic_test test_name) PRIVATE libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} ${link_libraries} - LibcTest.hermetic LibcHermeticTestSupport.hermetic # The NVIDIA 'nvlink' linker does not currently support static libraries. $<$>:${fq_target_name}.__libc__>) @@ -714,8 +716,12 @@ function(add_libc_hermetic_test test_name) ) add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name}) - add_dependencies(libc-hermetic-tests ${fq_target_name}) -endfunction(add_libc_hermetic_test) + if(NOT ${HERMETIC_TEST_IS_BENCHMARK}) + # If it is a benchmark, it will already have been added to the + # gpu-benchmark target + add_dependencies(libc-hermetic-tests ${fq_target_name}) + endif() +endfunction(add_libc_hermetic) # A convenience function to add both a unit test as well as a hermetic test. function(add_libc_test test_name) @@ -730,7 +736,12 @@ function(add_libc_test test_name) add_libc_unittest(${test_name}.__unit__ ${LIBC_TEST_UNPARSED_ARGUMENTS}) endif() if(LIBC_ENABLE_HERMETIC_TESTS AND NOT LIBC_TEST_UNIT_TEST_ONLY) - add_libc_hermetic_test(${test_name}.__hermetic__ ${LIBC_TEST_UNPARSED_ARGUMENTS}) + add_libc_hermetic( + ${test_name}.__hermetic__ + LINK_LIBRARIES + LibcTest.hermetic + ${LIBC_TEST_UNPARSED_ARGUMENTS} + ) get_fq_target_name(${test_name} fq_test_name) if(TARGET ${fq_test_name}.__hermetic__ AND TARGET ${fq_test_name}.__unit__) # Tests like the file tests perform file operations on disk file. 
If we diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index dc8beb14fd7f44..e2a0908023a028 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -16,6 +16,17 @@ if(NOT LLVM_LIBC_FULL_BUILD) "GPU.") endif() +# Set the required flags globally so standard CMake utilities can compile. +if(LIBC_TARGET_TRIPLE) + set(CMAKE_REQUIRED_FLAGS "--target=${LIBC_TARGET_TRIPLE}") +endif() +if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib") +elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + set(CMAKE_REQUIRED_FLAGS + "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument") +endif() + # Identify the program used to package multiple images into a single binary. get_filename_component(compiler_path ${CMAKE_CXX_COMPILER} DIRECTORY) if(TARGET clang-offload-packager) @@ -56,39 +67,9 @@ endif() set(LIBC_GPU_TEST_ARCHITECTURE "" CACHE STRING "Architecture for the GPU tests") if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - # Identify any locally installed AMD GPUs on the system using 'amdgpu-arch'. - if(TARGET amdgpu-arch) - get_target_property(LIBC_AMDGPU_ARCH amdgpu-arch LOCATION) - else() - find_program(LIBC_AMDGPU_ARCH - NAMES amdgpu-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path}) - endif() - if(LIBC_AMDGPU_ARCH) - execute_process(COMMAND ${LIBC_AMDGPU_ARCH} - OUTPUT_VARIABLE arch_tool_output - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(arch_tool_output MATCHES "^gfx[0-9]+") - set(PLATFORM_HAS_GPU TRUE) - endif() - endif() + check_cxx_compiler_flag(-mcpu=native PLATFORM_HAS_GPU) elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - # Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'. 
- if(TARGET nvptx-arch) - get_target_property(LIBC_NVPTX_ARCH nvptx-arch LOCATION) - else() - find_program(LIBC_NVPTX_ARCH - NAMES nvptx-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path}) - endif() - if(LIBC_NVPTX_ARCH) - execute_process(COMMAND ${LIBC_NVPTX_ARCH} - OUTPUT_VARIABLE arch_tool_output - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(arch_tool_output MATCHES "^sm_[0-9]+") - set(PLATFORM_HAS_GPU TRUE) - endif() - endif() + check_cxx_compiler_flag(-march=native PLATFORM_HAS_GPU) endif() set(gpu_test_architecture "") diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index b8e97783c77237..9651939454ccc5 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -83,93 +83,93 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.inttypes.strtoumax # stdio.h entrypoints - libc.src.stdio.remove libc.src.stdio.printf libc.src.stdio.putchar - libc.src.stdio.sprintf + libc.src.stdio.remove libc.src.stdio.snprintf + libc.src.stdio.sprintf libc.src.stdio.vprintf - libc.src.stdio.vsprintf libc.src.stdio.vsnprintf + libc.src.stdio.vsprintf # stdbit.h entrypoints - libc.src.stdbit.stdc_leading_zeros_uc - libc.src.stdbit.stdc_leading_zeros_us - libc.src.stdbit.stdc_leading_zeros_ui - libc.src.stdbit.stdc_leading_zeros_ul - libc.src.stdbit.stdc_leading_zeros_ull - libc.src.stdbit.stdc_leading_ones_uc - libc.src.stdbit.stdc_leading_ones_us - libc.src.stdbit.stdc_leading_ones_ui - libc.src.stdbit.stdc_leading_ones_ul - libc.src.stdbit.stdc_leading_ones_ull - libc.src.stdbit.stdc_trailing_zeros_uc - libc.src.stdbit.stdc_trailing_zeros_us - libc.src.stdbit.stdc_trailing_zeros_ui - libc.src.stdbit.stdc_trailing_zeros_ul - libc.src.stdbit.stdc_trailing_zeros_ull - libc.src.stdbit.stdc_trailing_ones_uc - libc.src.stdbit.stdc_trailing_ones_us - libc.src.stdbit.stdc_trailing_ones_ui - libc.src.stdbit.stdc_trailing_ones_ul - libc.src.stdbit.stdc_trailing_ones_ull - libc.src.stdbit.stdc_first_leading_zero_uc - libc.src.stdbit.stdc_first_leading_zero_us - libc.src.stdbit.stdc_first_leading_zero_ui - libc.src.stdbit.stdc_first_leading_zero_ul - libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_bit_ceil_uc + libc.src.stdbit.stdc_bit_ceil_ui + libc.src.stdbit.stdc_bit_ceil_ul + libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_bit_ceil_us + libc.src.stdbit.stdc_bit_floor_uc + libc.src.stdbit.stdc_bit_floor_ui + libc.src.stdbit.stdc_bit_floor_ul + libc.src.stdbit.stdc_bit_floor_ull + libc.src.stdbit.stdc_bit_floor_us + libc.src.stdbit.stdc_bit_width_uc + libc.src.stdbit.stdc_bit_width_ui + libc.src.stdbit.stdc_bit_width_ul + libc.src.stdbit.stdc_bit_width_ull + libc.src.stdbit.stdc_bit_width_us + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_zeros_us libc.src.stdbit.stdc_first_leading_one_uc - libc.src.stdbit.stdc_first_leading_one_us libc.src.stdbit.stdc_first_leading_one_ui libc.src.stdbit.stdc_first_leading_one_ul libc.src.stdbit.stdc_first_leading_one_ull - libc.src.stdbit.stdc_first_trailing_zero_uc - libc.src.stdbit.stdc_first_trailing_zero_us - libc.src.stdbit.stdc_first_trailing_zero_ui - libc.src.stdbit.stdc_first_trailing_zero_ul - 
libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_zero_us libc.src.stdbit.stdc_first_trailing_one_uc - libc.src.stdbit.stdc_first_trailing_one_us libc.src.stdbit.stdc_first_trailing_one_ui libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull - libc.src.stdbit.stdc_count_zeros_uc - libc.src.stdbit.stdc_count_zeros_us - libc.src.stdbit.stdc_count_zeros_ui - libc.src.stdbit.stdc_count_zeros_ul - libc.src.stdbit.stdc_count_zeros_ull - libc.src.stdbit.stdc_count_ones_uc - libc.src.stdbit.stdc_count_ones_us - libc.src.stdbit.stdc_count_ones_ui - libc.src.stdbit.stdc_count_ones_ul - libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_zero_us libc.src.stdbit.stdc_has_single_bit_uc - libc.src.stdbit.stdc_has_single_bit_us libc.src.stdbit.stdc_has_single_bit_ui libc.src.stdbit.stdc_has_single_bit_ul libc.src.stdbit.stdc_has_single_bit_ull - libc.src.stdbit.stdc_bit_width_uc - libc.src.stdbit.stdc_bit_width_us - libc.src.stdbit.stdc_bit_width_ui - libc.src.stdbit.stdc_bit_width_ul - libc.src.stdbit.stdc_bit_width_ull - libc.src.stdbit.stdc_bit_floor_uc - libc.src.stdbit.stdc_bit_floor_us - libc.src.stdbit.stdc_bit_floor_ui - libc.src.stdbit.stdc_bit_floor_ul - libc.src.stdbit.stdc_bit_floor_ull - libc.src.stdbit.stdc_bit_ceil_uc - libc.src.stdbit.stdc_bit_ceil_us - libc.src.stdbit.stdc_bit_ceil_ui - libc.src.stdbit.stdc_bit_ceil_ul - libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_zeros_us # stdlib.h entrypoints libc.src.stdlib.abort libc.src.stdlib.abs libc.src.stdlib.aligned_alloc - libc.src.stdlib.atoi libc.src.stdlib.atof + libc.src.stdlib.atoi libc.src.stdlib.atol libc.src.stdlib.atoll libc.src.stdlib.bsearch @@ -211,11 +211,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fegetexceptflag libc.src.fenv.fegetround libc.src.fenv.feholdexcept + libc.src.fenv.feraiseexcept libc.src.fenv.fesetenv libc.src.fenv.fesetexcept libc.src.fenv.fesetexceptflag libc.src.fenv.fesetround - libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv @@ -228,12 +228,16 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf + libc.src.math.canonicalize + libc.src.math.canonicalizef + 
libc.src.math.canonicalizel libc.src.math.ceil libc.src.math.ceilf libc.src.math.ceill libc.src.math.copysign libc.src.math.copysignf libc.src.math.copysignl + libc.src.math.cos libc.src.math.cosf libc.src.math.coshf libc.src.math.erff @@ -242,6 +246,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.exp10f libc.src.math.exp2 libc.src.math.exp2f + libc.src.math.exp2m1f libc.src.math.expf libc.src.math.expm1 libc.src.math.expm1f @@ -258,16 +263,47 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmaf libc.src.math.fmax libc.src.math.fmaxf + libc.src.math.fmaximum + libc.src.math.fmaximum_mag + libc.src.math.fmaximum_mag_num + libc.src.math.fmaximum_mag_numf + libc.src.math.fmaximum_mag_numl + libc.src.math.fmaximum_magf + libc.src.math.fmaximum_magl + libc.src.math.fmaximum_num + libc.src.math.fmaximum_numf + libc.src.math.fmaximum_numl + libc.src.math.fmaximumf + libc.src.math.fmaximuml libc.src.math.fmaxl libc.src.math.fmin libc.src.math.fminf + libc.src.math.fminimum + libc.src.math.fminimum_mag + libc.src.math.fminimum_mag_num + libc.src.math.fminimum_mag_numf + libc.src.math.fminimum_mag_numl + libc.src.math.fminimum_magf + libc.src.math.fminimum_magl + libc.src.math.fminimum_num + libc.src.math.fminimum_numf + libc.src.math.fminimum_numl + libc.src.math.fminimumf + libc.src.math.fminimuml libc.src.math.fminl libc.src.math.fmod libc.src.math.fmodf libc.src.math.fmodl + libc.src.math.fmul libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl + libc.src.math.fromfp + libc.src.math.fromfpf + libc.src.math.fromfpl + libc.src.math.fromfpx + libc.src.math.fromfpxf + libc.src.math.fromfpxl libc.src.math.hypot libc.src.math.hypotf libc.src.math.ilogb @@ -314,9 +350,15 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.nextafter libc.src.math.nextafterf libc.src.math.nextafterl + libc.src.math.nextdown + libc.src.math.nextdownf + libc.src.math.nextdownl libc.src.math.nexttoward libc.src.math.nexttowardf libc.src.math.nexttowardl + libc.src.math.nextup + libc.src.math.nextupf + libc.src.math.nextupl libc.src.math.powf libc.src.math.remainder libc.src.math.remainderf @@ -328,11 +370,16 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.rintf libc.src.math.rintl libc.src.math.round + libc.src.math.roundeven + libc.src.math.roundevenf + libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl + libc.src.math.sin + libc.src.math.sincos libc.src.math.sincosf libc.src.math.sinf libc.src.math.sinhf @@ -344,6 +391,12 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.trunc libc.src.math.truncf libc.src.math.truncl + libc.src.math.ufromfp + libc.src.math.ufromfpf + libc.src.math.ufromfpl + libc.src.math.ufromfpx + libc.src.math.ufromfpxf + libc.src.math.ufromfpxl ) if(LIBC_COMPILER_HAS_FIXED_POINT) @@ -352,23 +405,23 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.abshk libc.src.stdfix.abshr libc.src.stdfix.absk - libc.src.stdfix.absr libc.src.stdfix.abslk libc.src.stdfix.abslr + libc.src.stdfix.absr libc.src.stdfix.exphk libc.src.stdfix.expk libc.src.stdfix.roundhk libc.src.stdfix.roundhr libc.src.stdfix.roundk - libc.src.stdfix.roundr libc.src.stdfix.roundlk libc.src.stdfix.roundlr + libc.src.stdfix.roundr libc.src.stdfix.rounduhk libc.src.stdfix.rounduhr libc.src.stdfix.rounduk - libc.src.stdfix.roundur libc.src.stdfix.roundulk libc.src.stdfix.roundulr + libc.src.stdfix.roundur libc.src.stdfix.sqrtuhk libc.src.stdfix.sqrtuhr libc.src.stdfix.sqrtuk diff --git a/libc/config/baremetal/riscv/entrypoints.txt 
b/libc/config/baremetal/riscv/entrypoints.txt index c9070c2691b192..8b37acef259c4f 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -79,93 +79,93 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.inttypes.strtoumax # stdio.h entrypoints - libc.src.stdio.remove libc.src.stdio.printf libc.src.stdio.putchar - libc.src.stdio.sprintf + libc.src.stdio.remove libc.src.stdio.snprintf + libc.src.stdio.sprintf libc.src.stdio.vprintf - libc.src.stdio.vsprintf libc.src.stdio.vsnprintf + libc.src.stdio.vsprintf # stdbit.h entrypoints - libc.src.stdbit.stdc_leading_zeros_uc - libc.src.stdbit.stdc_leading_zeros_us - libc.src.stdbit.stdc_leading_zeros_ui - libc.src.stdbit.stdc_leading_zeros_ul - libc.src.stdbit.stdc_leading_zeros_ull - libc.src.stdbit.stdc_leading_ones_uc - libc.src.stdbit.stdc_leading_ones_us - libc.src.stdbit.stdc_leading_ones_ui - libc.src.stdbit.stdc_leading_ones_ul - libc.src.stdbit.stdc_leading_ones_ull - libc.src.stdbit.stdc_trailing_zeros_uc - libc.src.stdbit.stdc_trailing_zeros_us - libc.src.stdbit.stdc_trailing_zeros_ui - libc.src.stdbit.stdc_trailing_zeros_ul - libc.src.stdbit.stdc_trailing_zeros_ull - libc.src.stdbit.stdc_trailing_ones_uc - libc.src.stdbit.stdc_trailing_ones_us - libc.src.stdbit.stdc_trailing_ones_ui - libc.src.stdbit.stdc_trailing_ones_ul - libc.src.stdbit.stdc_trailing_ones_ull - libc.src.stdbit.stdc_first_leading_zero_uc - libc.src.stdbit.stdc_first_leading_zero_us - libc.src.stdbit.stdc_first_leading_zero_ui - libc.src.stdbit.stdc_first_leading_zero_ul - libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_bit_ceil_uc + libc.src.stdbit.stdc_bit_ceil_ui + libc.src.stdbit.stdc_bit_ceil_ul + libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_bit_ceil_us + libc.src.stdbit.stdc_bit_floor_uc + libc.src.stdbit.stdc_bit_floor_ui + libc.src.stdbit.stdc_bit_floor_ul + libc.src.stdbit.stdc_bit_floor_ull + libc.src.stdbit.stdc_bit_floor_us + libc.src.stdbit.stdc_bit_width_uc + libc.src.stdbit.stdc_bit_width_ui + libc.src.stdbit.stdc_bit_width_ul + libc.src.stdbit.stdc_bit_width_ull + libc.src.stdbit.stdc_bit_width_us + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_zeros_us libc.src.stdbit.stdc_first_leading_one_uc - libc.src.stdbit.stdc_first_leading_one_us libc.src.stdbit.stdc_first_leading_one_ui libc.src.stdbit.stdc_first_leading_one_ul libc.src.stdbit.stdc_first_leading_one_ull - libc.src.stdbit.stdc_first_trailing_zero_uc - libc.src.stdbit.stdc_first_trailing_zero_us - libc.src.stdbit.stdc_first_trailing_zero_ui - libc.src.stdbit.stdc_first_trailing_zero_ul - libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_zero_us libc.src.stdbit.stdc_first_trailing_one_uc - libc.src.stdbit.stdc_first_trailing_one_us libc.src.stdbit.stdc_first_trailing_one_ui libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull - libc.src.stdbit.stdc_count_zeros_uc - libc.src.stdbit.stdc_count_zeros_us - 
libc.src.stdbit.stdc_count_zeros_ui - libc.src.stdbit.stdc_count_zeros_ul - libc.src.stdbit.stdc_count_zeros_ull - libc.src.stdbit.stdc_count_ones_uc - libc.src.stdbit.stdc_count_ones_us - libc.src.stdbit.stdc_count_ones_ui - libc.src.stdbit.stdc_count_ones_ul - libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_zero_us libc.src.stdbit.stdc_has_single_bit_uc - libc.src.stdbit.stdc_has_single_bit_us libc.src.stdbit.stdc_has_single_bit_ui libc.src.stdbit.stdc_has_single_bit_ul libc.src.stdbit.stdc_has_single_bit_ull - libc.src.stdbit.stdc_bit_width_uc - libc.src.stdbit.stdc_bit_width_us - libc.src.stdbit.stdc_bit_width_ui - libc.src.stdbit.stdc_bit_width_ul - libc.src.stdbit.stdc_bit_width_ull - libc.src.stdbit.stdc_bit_floor_uc - libc.src.stdbit.stdc_bit_floor_us - libc.src.stdbit.stdc_bit_floor_ui - libc.src.stdbit.stdc_bit_floor_ul - libc.src.stdbit.stdc_bit_floor_ull - libc.src.stdbit.stdc_bit_ceil_uc - libc.src.stdbit.stdc_bit_ceil_us - libc.src.stdbit.stdc_bit_ceil_ui - libc.src.stdbit.stdc_bit_ceil_ul - libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_zeros_us # stdlib.h entrypoints libc.src.stdlib.abort libc.src.stdlib.abs libc.src.stdlib.aligned_alloc - libc.src.stdlib.atoi libc.src.stdlib.atof + libc.src.stdlib.atoi libc.src.stdlib.atol libc.src.stdlib.atoll libc.src.stdlib.bsearch @@ -207,11 +207,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fegetexceptflag libc.src.fenv.fegetround libc.src.fenv.feholdexcept + libc.src.fenv.feraiseexcept libc.src.fenv.fesetenv libc.src.fenv.fesetexcept libc.src.fenv.fesetexceptflag libc.src.fenv.fesetround - libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv @@ -224,12 +224,16 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf + libc.src.math.canonicalize + libc.src.math.canonicalizef + libc.src.math.canonicalizel libc.src.math.ceil libc.src.math.ceilf libc.src.math.ceill libc.src.math.copysign libc.src.math.copysignf libc.src.math.copysignl + libc.src.math.cos libc.src.math.cosf libc.src.math.coshf libc.src.math.erff @@ -238,6 +242,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.exp10f libc.src.math.exp2 libc.src.math.exp2f + libc.src.math.exp2m1f libc.src.math.expf libc.src.math.expm1 libc.src.math.expm1f @@ -254,15 +259,46 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmaf libc.src.math.fmax libc.src.math.fmaxf + libc.src.math.fmaximum + libc.src.math.fmaximum_mag + 
libc.src.math.fmaximum_mag_num + libc.src.math.fmaximum_mag_numf + libc.src.math.fmaximum_mag_numl + libc.src.math.fmaximum_magf + libc.src.math.fmaximum_magl + libc.src.math.fmaximum_num + libc.src.math.fmaximum_numf + libc.src.math.fmaximum_numl + libc.src.math.fmaximumf + libc.src.math.fmaximuml libc.src.math.fmaxl libc.src.math.fmin libc.src.math.fminf + libc.src.math.fminimum + libc.src.math.fminimum_mag + libc.src.math.fminimum_mag_num + libc.src.math.fminimum_mag_numf + libc.src.math.fminimum_mag_numl + libc.src.math.fminimum_magf + libc.src.math.fminimum_magl + libc.src.math.fminimum_num + libc.src.math.fminimum_numf + libc.src.math.fminimum_numl + libc.src.math.fminimumf + libc.src.math.fminimuml libc.src.math.fminl libc.src.math.fmod libc.src.math.fmodf + libc.src.math.fmul libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl + libc.src.math.fromfp + libc.src.math.fromfpf + libc.src.math.fromfpl + libc.src.math.fromfpx + libc.src.math.fromfpxf + libc.src.math.fromfpxl libc.src.math.hypot libc.src.math.hypotf libc.src.math.ilogb @@ -309,9 +345,15 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.nextafter libc.src.math.nextafterf libc.src.math.nextafterl + libc.src.math.nextdown + libc.src.math.nextdownf + libc.src.math.nextdownl libc.src.math.nexttoward libc.src.math.nexttowardf libc.src.math.nexttowardl + libc.src.math.nextup + libc.src.math.nextupf + libc.src.math.nextupl libc.src.math.powf libc.src.math.remainder libc.src.math.remainderf @@ -323,11 +365,16 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.rintf libc.src.math.rintl libc.src.math.round + libc.src.math.roundeven + libc.src.math.roundevenf + libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl + libc.src.math.sin + libc.src.math.sincos libc.src.math.sincosf libc.src.math.sinf libc.src.math.sinhf @@ -339,6 +386,12 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.trunc libc.src.math.truncf libc.src.math.truncl + libc.src.math.ufromfp + libc.src.math.ufromfpf + libc.src.math.ufromfpl + libc.src.math.ufromfpx + libc.src.math.ufromfpxf + libc.src.math.ufromfpxl ) if(LIBC_COMPILER_HAS_FIXED_POINT) @@ -347,23 +400,23 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.abshk libc.src.stdfix.abshr libc.src.stdfix.absk - libc.src.stdfix.absr libc.src.stdfix.abslk libc.src.stdfix.abslr + libc.src.stdfix.absr libc.src.stdfix.exphk libc.src.stdfix.expk libc.src.stdfix.roundhk libc.src.stdfix.roundhr libc.src.stdfix.roundk - libc.src.stdfix.roundr libc.src.stdfix.roundlk libc.src.stdfix.roundlr + libc.src.stdfix.roundr libc.src.stdfix.rounduhk libc.src.stdfix.rounduhr libc.src.stdfix.rounduk - libc.src.stdfix.roundur libc.src.stdfix.roundulk libc.src.stdfix.roundulr + libc.src.stdfix.roundur libc.src.stdfix.sqrtuhk libc.src.stdfix.sqrtuhr libc.src.stdfix.sqrtuk diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index e1303265b9ac41..4d5536318724d0 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -130,6 +130,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ceilf libc.src.math.ceill libc.src.math.coshf + libc.src.math.cos libc.src.math.cosf libc.src.math.erff libc.src.math.exp @@ -224,8 +225,10 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl + libc.src.math.sincos libc.src.math.sincosf libc.src.math.sinhf + libc.src.math.sin libc.src.math.sinf libc.src.math.sqrt libc.src.math.sqrtf diff --git 
a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 2217a696fc5d18..69f1bdb381e161 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -1,10 +1,10 @@ if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) set(extra_entrypoints # stdio.h entrypoints - libc.src.stdio.sprintf libc.src.stdio.snprintf - libc.src.stdio.vsprintf + libc.src.stdio.sprintf libc.src.stdio.vsnprintf + libc.src.stdio.vsprintf ) endif() @@ -76,86 +76,90 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strxfrm # stdbit.h entrypoints - libc.src.stdbit.stdc_leading_zeros_uc - libc.src.stdbit.stdc_leading_zeros_us - libc.src.stdbit.stdc_leading_zeros_ui - libc.src.stdbit.stdc_leading_zeros_ul - libc.src.stdbit.stdc_leading_zeros_ull - libc.src.stdbit.stdc_leading_ones_uc - libc.src.stdbit.stdc_leading_ones_us - libc.src.stdbit.stdc_leading_ones_ui - libc.src.stdbit.stdc_leading_ones_ul - libc.src.stdbit.stdc_leading_ones_ull - libc.src.stdbit.stdc_trailing_zeros_uc - libc.src.stdbit.stdc_trailing_zeros_us - libc.src.stdbit.stdc_trailing_zeros_ui - libc.src.stdbit.stdc_trailing_zeros_ul - libc.src.stdbit.stdc_trailing_zeros_ull - libc.src.stdbit.stdc_trailing_ones_uc - libc.src.stdbit.stdc_trailing_ones_us - libc.src.stdbit.stdc_trailing_ones_ui - libc.src.stdbit.stdc_trailing_ones_ul - libc.src.stdbit.stdc_trailing_ones_ull - libc.src.stdbit.stdc_first_leading_zero_uc - libc.src.stdbit.stdc_first_leading_zero_us - libc.src.stdbit.stdc_first_leading_zero_ui - libc.src.stdbit.stdc_first_leading_zero_ul - libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_bit_ceil_uc + libc.src.stdbit.stdc_bit_ceil_ui + libc.src.stdbit.stdc_bit_ceil_ul + libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_bit_ceil_us + libc.src.stdbit.stdc_bit_floor_uc + libc.src.stdbit.stdc_bit_floor_ui + libc.src.stdbit.stdc_bit_floor_ul + libc.src.stdbit.stdc_bit_floor_ull + libc.src.stdbit.stdc_bit_floor_us + libc.src.stdbit.stdc_bit_width_uc + libc.src.stdbit.stdc_bit_width_ui + libc.src.stdbit.stdc_bit_width_ul + libc.src.stdbit.stdc_bit_width_ull + libc.src.stdbit.stdc_bit_width_us + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_zeros_us libc.src.stdbit.stdc_first_leading_one_uc - libc.src.stdbit.stdc_first_leading_one_us libc.src.stdbit.stdc_first_leading_one_ui libc.src.stdbit.stdc_first_leading_one_ul libc.src.stdbit.stdc_first_leading_one_ull - libc.src.stdbit.stdc_first_trailing_zero_uc - libc.src.stdbit.stdc_first_trailing_zero_us - libc.src.stdbit.stdc_first_trailing_zero_ui - libc.src.stdbit.stdc_first_trailing_zero_ul - libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_zero_us libc.src.stdbit.stdc_first_trailing_one_uc - libc.src.stdbit.stdc_first_trailing_one_us libc.src.stdbit.stdc_first_trailing_one_ui libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull - libc.src.stdbit.stdc_count_zeros_uc - libc.src.stdbit.stdc_count_zeros_us - libc.src.stdbit.stdc_count_zeros_ui - 
libc.src.stdbit.stdc_count_zeros_ul - libc.src.stdbit.stdc_count_zeros_ull - libc.src.stdbit.stdc_count_ones_uc - libc.src.stdbit.stdc_count_ones_us - libc.src.stdbit.stdc_count_ones_ui - libc.src.stdbit.stdc_count_ones_ul - libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_zero_us libc.src.stdbit.stdc_has_single_bit_uc - libc.src.stdbit.stdc_has_single_bit_us libc.src.stdbit.stdc_has_single_bit_ui libc.src.stdbit.stdc_has_single_bit_ul libc.src.stdbit.stdc_has_single_bit_ull - libc.src.stdbit.stdc_bit_width_uc - libc.src.stdbit.stdc_bit_width_us - libc.src.stdbit.stdc_bit_width_ui - libc.src.stdbit.stdc_bit_width_ul - libc.src.stdbit.stdc_bit_width_ull - libc.src.stdbit.stdc_bit_floor_uc - libc.src.stdbit.stdc_bit_floor_us - libc.src.stdbit.stdc_bit_floor_ui - libc.src.stdbit.stdc_bit_floor_ul - libc.src.stdbit.stdc_bit_floor_ull - libc.src.stdbit.stdc_bit_ceil_uc - libc.src.stdbit.stdc_bit_ceil_us - libc.src.stdbit.stdc_bit_ceil_ui - libc.src.stdbit.stdc_bit_ceil_ul - libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_zeros_us # stdlib.h entrypoints + libc.src.stdlib._Exit + libc.src.stdlib.abort libc.src.stdlib.abs - libc.src.stdlib.atoi + libc.src.stdlib.atexit libc.src.stdlib.atof + libc.src.stdlib.atoi libc.src.stdlib.atol libc.src.stdlib.atoll + libc.src.stdlib.bsearch libc.src.stdlib.div + libc.src.stdlib.exit libc.src.stdlib.labs - libc.src.stdlib.bsearch libc.src.stdlib.ldiv libc.src.stdlib.llabs libc.src.stdlib.lldiv @@ -170,45 +174,41 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.strtoll libc.src.stdlib.strtoul libc.src.stdlib.strtoull - libc.src.stdlib._Exit - libc.src.stdlib.atexit - libc.src.stdlib.exit - libc.src.stdlib.abort # Only implemented in the test suite - libc.src.stdlib.malloc libc.src.stdlib.aligned_alloc - libc.src.stdlib.realloc libc.src.stdlib.free + libc.src.stdlib.malloc + libc.src.stdlib.realloc # errno.h entrypoints libc.src.errno.errno # stdio.h entrypoints ${extra_entrypoints} + libc.src.stdio.clearerr + libc.src.stdio.fclose libc.src.stdio.feof libc.src.stdio.ferror - libc.src.stdio.fseek libc.src.stdio.fflush - libc.src.stdio.ftell - libc.src.stdio.clearerr - libc.src.stdio.puts + libc.src.stdio.fgetc + libc.src.stdio.fgets libc.src.stdio.fopen - libc.src.stdio.fclose - libc.src.stdio.fread + libc.src.stdio.fputc libc.src.stdio.fputs + libc.src.stdio.fread + libc.src.stdio.fseek + libc.src.stdio.ftell libc.src.stdio.fwrite - libc.src.stdio.fputc - libc.src.stdio.putc - libc.src.stdio.putchar - 
libc.src.stdio.fgets - libc.src.stdio.fgetc libc.src.stdio.getc libc.src.stdio.getchar - libc.src.stdio.ungetc + libc.src.stdio.putc + libc.src.stdio.putchar + libc.src.stdio.puts + libc.src.stdio.stderr libc.src.stdio.stdin libc.src.stdio.stdout - libc.src.stdio.stderr + libc.src.stdio.ungetc # inttypes.h entrypoints libc.src.inttypes.imaxabs @@ -236,9 +236,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.asinh libc.src.math.asinhf libc.src.math.atan - libc.src.math.atanf libc.src.math.atan2 libc.src.math.atan2f + libc.src.math.atanf libc.src.math.atanh libc.src.math.atanhf libc.src.math.ceil @@ -251,11 +251,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.coshf libc.src.math.erf libc.src.math.erff + libc.src.math.exp libc.src.math.exp10 libc.src.math.exp10f libc.src.math.exp2 libc.src.math.exp2f - libc.src.math.exp libc.src.math.expf libc.src.math.expm1 libc.src.math.expm1f @@ -287,13 +287,13 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.llrintf libc.src.math.llround libc.src.math.llroundf + libc.src.math.log libc.src.math.log10 libc.src.math.log10f libc.src.math.log1p libc.src.math.log1pf libc.src.math.log2 libc.src.math.log2f - libc.src.math.log libc.src.math.logf libc.src.math.lrint libc.src.math.lrintf @@ -322,9 +322,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.sin - libc.src.math.sinf libc.src.math.sincos libc.src.math.sincosf + libc.src.math.sinf libc.src.math.sinh libc.src.math.sinhf libc.src.math.sqrt diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 63238ec0094fbf..ea89f8bd138d6a 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -92,81 +92,81 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.inttypes.strtoumax # stdbit.h entrypoints - libc.src.stdbit.stdc_leading_zeros_uc - libc.src.stdbit.stdc_leading_zeros_us - libc.src.stdbit.stdc_leading_zeros_ui - libc.src.stdbit.stdc_leading_zeros_ul - libc.src.stdbit.stdc_leading_zeros_ull - libc.src.stdbit.stdc_leading_ones_uc - libc.src.stdbit.stdc_leading_ones_us - libc.src.stdbit.stdc_leading_ones_ui - libc.src.stdbit.stdc_leading_ones_ul - libc.src.stdbit.stdc_leading_ones_ull - libc.src.stdbit.stdc_trailing_zeros_uc - libc.src.stdbit.stdc_trailing_zeros_us - libc.src.stdbit.stdc_trailing_zeros_ui - libc.src.stdbit.stdc_trailing_zeros_ul - libc.src.stdbit.stdc_trailing_zeros_ull - libc.src.stdbit.stdc_trailing_ones_uc - libc.src.stdbit.stdc_trailing_ones_us - libc.src.stdbit.stdc_trailing_ones_ui - libc.src.stdbit.stdc_trailing_ones_ul - libc.src.stdbit.stdc_trailing_ones_ull - libc.src.stdbit.stdc_first_leading_zero_uc - libc.src.stdbit.stdc_first_leading_zero_us - libc.src.stdbit.stdc_first_leading_zero_ui - libc.src.stdbit.stdc_first_leading_zero_ul - libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_bit_ceil_uc + libc.src.stdbit.stdc_bit_ceil_ui + libc.src.stdbit.stdc_bit_ceil_ul + libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_bit_ceil_us + libc.src.stdbit.stdc_bit_floor_uc + libc.src.stdbit.stdc_bit_floor_ui + libc.src.stdbit.stdc_bit_floor_ul + libc.src.stdbit.stdc_bit_floor_ull + libc.src.stdbit.stdc_bit_floor_us + libc.src.stdbit.stdc_bit_width_uc + libc.src.stdbit.stdc_bit_width_ui + libc.src.stdbit.stdc_bit_width_ul + libc.src.stdbit.stdc_bit_width_ull + libc.src.stdbit.stdc_bit_width_us + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull + 
libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_zeros_us libc.src.stdbit.stdc_first_leading_one_uc - libc.src.stdbit.stdc_first_leading_one_us libc.src.stdbit.stdc_first_leading_one_ui libc.src.stdbit.stdc_first_leading_one_ul libc.src.stdbit.stdc_first_leading_one_ull - libc.src.stdbit.stdc_first_trailing_zero_uc - libc.src.stdbit.stdc_first_trailing_zero_us - libc.src.stdbit.stdc_first_trailing_zero_ui - libc.src.stdbit.stdc_first_trailing_zero_ul - libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_zero_us libc.src.stdbit.stdc_first_trailing_one_uc - libc.src.stdbit.stdc_first_trailing_one_us libc.src.stdbit.stdc_first_trailing_one_ui libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull - libc.src.stdbit.stdc_count_zeros_uc - libc.src.stdbit.stdc_count_zeros_us - libc.src.stdbit.stdc_count_zeros_ui - libc.src.stdbit.stdc_count_zeros_ul - libc.src.stdbit.stdc_count_zeros_ull - libc.src.stdbit.stdc_count_ones_uc - libc.src.stdbit.stdc_count_ones_us - libc.src.stdbit.stdc_count_ones_ui - libc.src.stdbit.stdc_count_ones_ul - libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_zero_us libc.src.stdbit.stdc_has_single_bit_uc - libc.src.stdbit.stdc_has_single_bit_us libc.src.stdbit.stdc_has_single_bit_ui libc.src.stdbit.stdc_has_single_bit_ul libc.src.stdbit.stdc_has_single_bit_ull - libc.src.stdbit.stdc_bit_width_uc - libc.src.stdbit.stdc_bit_width_us - libc.src.stdbit.stdc_bit_width_ui - libc.src.stdbit.stdc_bit_width_ul - libc.src.stdbit.stdc_bit_width_ull - libc.src.stdbit.stdc_bit_floor_uc - libc.src.stdbit.stdc_bit_floor_us - libc.src.stdbit.stdc_bit_floor_ui - libc.src.stdbit.stdc_bit_floor_ul - libc.src.stdbit.stdc_bit_floor_ull - libc.src.stdbit.stdc_bit_ceil_uc - libc.src.stdbit.stdc_bit_ceil_us - libc.src.stdbit.stdc_bit_ceil_ui - libc.src.stdbit.stdc_bit_ceil_ul - libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_zeros_us # stdlib.h entrypoints libc.src.stdlib.abs - libc.src.stdlib.atoi libc.src.stdlib.atof + libc.src.stdlib.atoi libc.src.stdlib.atol libc.src.stdlib.atoll 
libc.src.stdlib.bsearch @@ -188,37 +188,37 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.strtoull # stdlib.h external entrypoints - libc.src.stdlib.malloc - libc.src.stdlib.calloc - libc.src.stdlib.realloc libc.src.stdlib.aligned_alloc + libc.src.stdlib.calloc libc.src.stdlib.free + libc.src.stdlib.malloc + libc.src.stdlib.realloc # stdio.h entrypoints + libc.src.stdio.fdopen + #libc.src.stdio.fscanf libc.src.stdio.remove libc.src.stdio.rename - libc.src.stdio.sprintf libc.src.stdio.snprintf - libc.src.stdio.vsprintf - libc.src.stdio.vsnprintf - libc.src.stdio.fdopen - #libc.src.stdio.sscanf + libc.src.stdio.sprintf #libc.src.stdio.scanf - #libc.src.stdio.fscanf + #libc.src.stdio.sscanf + libc.src.stdio.vsnprintf + libc.src.stdio.vsprintf # sys/mman.h entrypoints libc.src.sys.mman.madvise - libc.src.sys.mman.mmap - libc.src.sys.mman.mprotect - libc.src.sys.mman.munmap - libc.src.sys.mman.posix_madvise libc.src.sys.mman.mincore libc.src.sys.mman.mlock libc.src.sys.mman.mlock2 - libc.src.sys.mman.munlock libc.src.sys.mman.mlockall - libc.src.sys.mman.munlockall + libc.src.sys.mman.mmap + libc.src.sys.mman.mprotect libc.src.sys.mman.msync + libc.src.sys.mman.munlock + libc.src.sys.mman.munlockall + libc.src.sys.mman.munmap + libc.src.sys.mman.posix_madvise libc.src.sys.mman.shm_open libc.src.sys.mman.shm_unlink @@ -267,11 +267,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.termios.cfgetospeed libc.src.termios.cfsetispeed libc.src.termios.cfsetospeed - libc.src.termios.tcgetattr - libc.src.termios.tcgetsid libc.src.termios.tcdrain libc.src.termios.tcflow libc.src.termios.tcflush + libc.src.termios.tcgetattr + libc.src.termios.tcgetsid libc.src.termios.tcsendbreak libc.src.termios.tcsetattr @@ -320,11 +320,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fegetexceptflag libc.src.fenv.fegetround libc.src.fenv.feholdexcept + libc.src.fenv.feraiseexcept libc.src.fenv.fesetenv libc.src.fenv.fesetexcept libc.src.fenv.fesetexceptflag libc.src.fenv.fesetround - libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv @@ -337,21 +337,22 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf - libc.src.math.copysign - libc.src.math.copysignf - libc.src.math.copysignl libc.src.math.ceil libc.src.math.ceilf libc.src.math.ceill - libc.src.math.coshf + libc.src.math.copysign + libc.src.math.copysignf + libc.src.math.copysignl + libc.src.math.cos libc.src.math.cosf + libc.src.math.coshf libc.src.math.erff libc.src.math.exp - libc.src.math.expf libc.src.math.exp10 libc.src.math.exp10f libc.src.math.exp2 libc.src.math.exp2f + libc.src.math.expf libc.src.math.expm1 libc.src.math.expm1f libc.src.math.fabs @@ -367,38 +368,38 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmaf libc.src.math.fmax libc.src.math.fmaxf - libc.src.math.fmaxl - libc.src.math.fmin - libc.src.math.fminf - libc.src.math.fminl libc.src.math.fmaximum - libc.src.math.fmaximumf - libc.src.math.fmaximuml - libc.src.math.fmaximum_num - libc.src.math.fmaximum_numf - libc.src.math.fmaximum_numl libc.src.math.fmaximum_mag - libc.src.math.fmaximum_magf - libc.src.math.fmaximum_magl libc.src.math.fmaximum_mag_num libc.src.math.fmaximum_mag_numf libc.src.math.fmaximum_mag_numl + libc.src.math.fmaximum_magf + libc.src.math.fmaximum_magl + libc.src.math.fmaximum_num + libc.src.math.fmaximum_numf + libc.src.math.fmaximum_numl + libc.src.math.fmaximumf + libc.src.math.fmaximuml + libc.src.math.fmaxl + libc.src.math.fmin + libc.src.math.fminf libc.src.math.fminimum - 
libc.src.math.fminimumf - libc.src.math.fminimuml - libc.src.math.fminimum_num - libc.src.math.fminimum_numf - libc.src.math.fminimum_numl libc.src.math.fminimum_mag - libc.src.math.fminimum_magf - libc.src.math.fminimum_magl libc.src.math.fminimum_mag_num libc.src.math.fminimum_mag_numf libc.src.math.fminimum_mag_numl - libc.src.math.fmul + libc.src.math.fminimum_magf + libc.src.math.fminimum_magl + libc.src.math.fminimum_num + libc.src.math.fminimum_numf + libc.src.math.fminimum_numl + libc.src.math.fminimumf + libc.src.math.fminimuml + libc.src.math.fminl libc.src.math.fmod libc.src.math.fmodf libc.src.math.fmodl + libc.src.math.fmul libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl @@ -413,6 +414,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ilogb libc.src.math.ilogbf libc.src.math.ilogbl + libc.src.math.ldexp + libc.src.math.ldexpf + libc.src.math.ldexpl libc.src.math.llogb libc.src.math.llogbf libc.src.math.llogbl @@ -422,26 +426,23 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.llround libc.src.math.llroundf libc.src.math.llroundl - libc.src.math.lrint - libc.src.math.lrintf - libc.src.math.lrintl - libc.src.math.lround - libc.src.math.lroundf - libc.src.math.lroundl - libc.src.math.ldexp - libc.src.math.ldexpf - libc.src.math.ldexpl + libc.src.math.log libc.src.math.log10 libc.src.math.log10f libc.src.math.log1p libc.src.math.log1pf libc.src.math.log2 libc.src.math.log2f - libc.src.math.log - libc.src.math.logf libc.src.math.logb libc.src.math.logbf libc.src.math.logbl + libc.src.math.logf + libc.src.math.lrint + libc.src.math.lrintf + libc.src.math.lrintl + libc.src.math.lround + libc.src.math.lroundf + libc.src.math.lroundl libc.src.math.modf libc.src.math.modff libc.src.math.modfl @@ -464,11 +465,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.nextupf libc.src.math.nextupl libc.src.math.powf - libc.src.math.remainderf libc.src.math.remainder + libc.src.math.remainderf libc.src.math.remainderl - libc.src.math.remquof libc.src.math.remquo + libc.src.math.remquof libc.src.math.remquol libc.src.math.rint libc.src.math.rintf @@ -479,9 +480,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl + libc.src.math.sin + libc.src.math.sincos libc.src.math.sincosf - libc.src.math.sinhf libc.src.math.sinf + libc.src.math.sinhf libc.src.math.sqrt libc.src.math.sqrtf libc.src.math.sqrtl @@ -504,21 +507,24 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 + libc.src.math.f16div + libc.src.math.f16divf libc.src.math.f16fmaf + libc.src.math.f16sqrt libc.src.math.f16sqrtf libc.src.math.fabsf16 libc.src.math.fdimf16 libc.src.math.floorf16 libc.src.math.fmaxf16 - libc.src.math.fmaximumf16 - libc.src.math.fmaximum_magf16 libc.src.math.fmaximum_mag_numf16 + libc.src.math.fmaximum_magf16 libc.src.math.fmaximum_numf16 + libc.src.math.fmaximumf16 libc.src.math.fminf16 - libc.src.math.fminimumf16 - libc.src.math.fminimum_magf16 libc.src.math.fminimum_mag_numf16 + libc.src.math.fminimum_magf16 libc.src.math.fminimum_numf16 + libc.src.math.fminimumf16 libc.src.math.frexpf16 libc.src.math.fromfpf16 libc.src.math.fromfpxf16 @@ -544,8 +550,8 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.remainderf16 libc.src.math.remquof16 libc.src.math.rintf16 - libc.src.math.roundf16 libc.src.math.roundevenf16 + libc.src.math.roundf16 libc.src.math.scalblnf16 libc.src.math.scalbnf16 libc.src.math.setpayloadf16 @@ -567,15 +573,15 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.fdimf128 libc.src.math.floorf128 
libc.src.math.fmaxf128 - libc.src.math.fminf128 - libc.src.math.fmaximumf128 - libc.src.math.fmaximum_numf128 - libc.src.math.fmaximum_magf128 libc.src.math.fmaximum_mag_numf128 - libc.src.math.fminimumf128 - libc.src.math.fminimum_numf128 - libc.src.math.fminimum_magf128 + libc.src.math.fmaximum_magf128 + libc.src.math.fmaximum_numf128 + libc.src.math.fmaximumf128 + libc.src.math.fminf128 libc.src.math.fminimum_mag_numf128 + libc.src.math.fminimum_magf128 + libc.src.math.fminimum_numf128 + libc.src.math.fminimumf128 libc.src.math.fmodf128 libc.src.math.frexpf128 libc.src.math.fromfpf128 @@ -583,9 +589,9 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.ilogbf128 libc.src.math.ldexpf128 libc.src.math.llogbf128 - libc.src.math.logbf128 libc.src.math.llrintf128 libc.src.math.llroundf128 + libc.src.math.logbf128 libc.src.math.lrintf128 libc.src.math.lroundf128 libc.src.math.modff128 @@ -619,11 +625,11 @@ if(LLVM_LIBC_FULL_BUILD) # pthread.h entrypoints libc.src.pthread.pthread_atfork libc.src.pthread.pthread_attr_destroy - libc.src.pthread.pthread_attr_init libc.src.pthread.pthread_attr_getdetachstate libc.src.pthread.pthread_attr_getguardsize libc.src.pthread.pthread_attr_getstack libc.src.pthread.pthread_attr_getstacksize + libc.src.pthread.pthread_attr_init libc.src.pthread.pthread_attr_setdetachstate libc.src.pthread.pthread_attr_setguardsize libc.src.pthread.pthread_attr_setstack @@ -637,36 +643,36 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_join libc.src.pthread.pthread_key_create libc.src.pthread.pthread_key_delete - libc.src.pthread.pthread_self - libc.src.pthread.pthread_setname_np libc.src.pthread.pthread_mutex_destroy libc.src.pthread.pthread_mutex_init libc.src.pthread.pthread_mutex_lock libc.src.pthread.pthread_mutex_unlock libc.src.pthread.pthread_mutexattr_destroy - libc.src.pthread.pthread_mutexattr_init libc.src.pthread.pthread_mutexattr_getpshared libc.src.pthread.pthread_mutexattr_getrobust libc.src.pthread.pthread_mutexattr_gettype + libc.src.pthread.pthread_mutexattr_init libc.src.pthread.pthread_mutexattr_setpshared libc.src.pthread.pthread_mutexattr_setrobust libc.src.pthread.pthread_mutexattr_settype libc.src.pthread.pthread_once + libc.src.pthread.pthread_rwlock_destroy libc.src.pthread.pthread_rwlock_init - libc.src.pthread.pthread_rwlock_tryrdlock libc.src.pthread.pthread_rwlock_rdlock libc.src.pthread.pthread_rwlock_timedrdlock - libc.src.pthread.pthread_rwlock_trywrlock - libc.src.pthread.pthread_rwlock_wrlock libc.src.pthread.pthread_rwlock_timedwrlock + libc.src.pthread.pthread_rwlock_tryrdlock + libc.src.pthread.pthread_rwlock_trywrlock libc.src.pthread.pthread_rwlock_unlock - libc.src.pthread.pthread_rwlock_destroy + libc.src.pthread.pthread_rwlock_wrlock libc.src.pthread.pthread_rwlockattr_destroy libc.src.pthread.pthread_rwlockattr_getkind_np libc.src.pthread.pthread_rwlockattr_getpshared libc.src.pthread.pthread_rwlockattr_init libc.src.pthread.pthread_rwlockattr_setkind_np libc.src.pthread.pthread_rwlockattr_setpshared + libc.src.pthread.pthread_self + libc.src.pthread.pthread_setname_np libc.src.pthread.pthread_setspecific # sched.h entrypoints @@ -676,18 +682,18 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdio.clearerr libc.src.stdio.clearerr_unlocked libc.src.stdio.fclose - libc.src.stdio.flockfile libc.src.stdio.feof libc.src.stdio.feof_unlocked libc.src.stdio.ferror libc.src.stdio.ferror_unlocked - libc.src.stdio.fgetc libc.src.stdio.fflush + libc.src.stdio.fgetc libc.src.stdio.fileno + libc.src.stdio.flockfile libc.src.stdio.fopen + 
libc.src.stdio.fopencookie libc.src.stdio.fputc libc.src.stdio.fputs - libc.src.stdio.fopencookie libc.src.stdio.fread libc.src.stdio.fread_unlocked libc.src.stdio.fseek @@ -699,14 +705,14 @@ if(LLVM_LIBC_FULL_BUILD) #TODO: Look into if fprintf can be enabled for overlay on aarch64 libc.src.stdio.fprintf libc.src.stdio.printf - libc.src.stdio.vfprintf - libc.src.stdio.vprintf libc.src.stdio.putc libc.src.stdio.putchar libc.src.stdio.puts libc.src.stdio.stderr libc.src.stdio.stdin libc.src.stdio.stdout + libc.src.stdio.vfprintf + libc.src.stdio.vprintf # stdlib.h entrypoints libc.src.stdlib._Exit @@ -716,24 +722,24 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.getenv # signal.h entrypoints - libc.src.signal.raise libc.src.signal.kill + libc.src.signal.raise libc.src.signal.sigaction + libc.src.signal.sigaddset libc.src.signal.sigaltstack libc.src.signal.sigdelset - libc.src.signal.sigaddset libc.src.signal.sigemptyset - libc.src.signal.sigprocmask libc.src.signal.sigfillset libc.src.signal.signal + libc.src.signal.sigprocmask # search.h entrypoints libc.src.search.hcreate libc.src.search.hcreate_r - libc.src.search.hsearch - libc.src.search.hsearch_r libc.src.search.hdestroy libc.src.search.hdestroy_r + libc.src.search.hsearch + libc.src.search.hsearch_r libc.src.search.insque libc.src.search.remque @@ -762,8 +768,8 @@ if(LLVM_LIBC_FULL_BUILD) # time.h entrypoints libc.src.time.asctime libc.src.time.asctime_r - libc.src.time.clock_gettime libc.src.time.clock + libc.src.time.clock_gettime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime @@ -780,9 +786,9 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.unistd.fork libc.src.unistd.getopt libc.src.unistd.optarg + libc.src.unistd.opterr libc.src.unistd.optind libc.src.unistd.optopt - libc.src.unistd.opterr libc.src.unistd.swab # sys/select.h entrypoints diff --git a/libc/config/linux/app.h b/libc/config/linux/app.h index 766cd49e88f6f7..2a3b1560817b8b 100644 --- a/libc/config/linux/app.h +++ b/libc/config/linux/app.h @@ -35,24 +35,6 @@ struct TLSImage { uintptr_t align; }; -#if defined(LIBC_TARGET_ARCH_IS_X86_64) || \ - defined(LIBC_TARGET_ARCH_IS_AARCH64) || \ - defined(LIBC_TARGET_ARCH_IS_ANY_RISCV) -// At the language level, argc is an int. But we use uint64_t as the x86_64 -// ABI specifies it as an 8 byte value. Likewise, in the ARM64 ABI, arguments -// are usually passed in registers. x0 is a doubleword register, so this is -// 64 bit for aarch64 as well. -typedef uintptr_t ArgcType; - -// At the language level, argv is a char** value. However, we use uint64_t as -// ABIs specify the argv vector be an |argc| long array of 8-byte values. -typedef uintptr_t ArgVEntryType; - -typedef uintptr_t EnvironType; -#else -#error "argc and argv types are not defined for the target platform." -#endif - // Linux manpage on `proc(5)` says that the aux vector is an array of // unsigned long pairs. // (see: https://man7.org/linux/man-pages/man5/proc.5.html) @@ -65,7 +47,7 @@ struct AuxEntry { }; struct Args { - ArgcType argc; + uintptr_t argc; // A flexible length array would be more suitable here, but C++ doesn't have // flexible arrays: P1039 proposes to fix this. So, for now we just fake it. @@ -73,7 +55,7 @@ struct Args { // (ISO C 5.1.2.2.1) so one is fine. Also, length of 1 is not really wrong as // |argc| is guaranteed to be atleast 1, and there is an 8-byte null entry at // the end of the argv array. - ArgVEntryType argv[1]; + uintptr_t argv[1]; }; // Data structure which captures properties of a linux application. 
@@ -87,7 +69,7 @@ struct AppProperties { TLSImage tls; // Environment data. - EnvironType *env_ptr; + uintptr_t *env_ptr; // Auxiliary vector data. AuxEntry *auxv_ptr; diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 681d3d2583fd3d..a27a4941534808 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -20,10 +20,6 @@ set(TARGET_LIBC_ENTRYPOINTS # errno.h entrypoints libc.src.errno.errno - # setjmp.h entrypoints - libc.src.setjmp.longjmp - libc.src.setjmp.setjmp - # string.h entrypoints libc.src.string.bcmp libc.src.string.bcopy @@ -72,81 +68,81 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.inttypes.strtoumax # stdbit.h entrypoints - libc.src.stdbit.stdc_leading_zeros_uc - libc.src.stdbit.stdc_leading_zeros_us - libc.src.stdbit.stdc_leading_zeros_ui - libc.src.stdbit.stdc_leading_zeros_ul - libc.src.stdbit.stdc_leading_zeros_ull - libc.src.stdbit.stdc_leading_ones_uc - libc.src.stdbit.stdc_leading_ones_us - libc.src.stdbit.stdc_leading_ones_ui - libc.src.stdbit.stdc_leading_ones_ul - libc.src.stdbit.stdc_leading_ones_ull - libc.src.stdbit.stdc_trailing_zeros_uc - libc.src.stdbit.stdc_trailing_zeros_us - libc.src.stdbit.stdc_trailing_zeros_ui - libc.src.stdbit.stdc_trailing_zeros_ul - libc.src.stdbit.stdc_trailing_zeros_ull - libc.src.stdbit.stdc_trailing_ones_uc - libc.src.stdbit.stdc_trailing_ones_us - libc.src.stdbit.stdc_trailing_ones_ui - libc.src.stdbit.stdc_trailing_ones_ul - libc.src.stdbit.stdc_trailing_ones_ull - libc.src.stdbit.stdc_first_leading_zero_uc - libc.src.stdbit.stdc_first_leading_zero_us - libc.src.stdbit.stdc_first_leading_zero_ui - libc.src.stdbit.stdc_first_leading_zero_ul - libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_bit_ceil_uc + libc.src.stdbit.stdc_bit_ceil_ui + libc.src.stdbit.stdc_bit_ceil_ul + libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_bit_ceil_us + libc.src.stdbit.stdc_bit_floor_uc + libc.src.stdbit.stdc_bit_floor_ui + libc.src.stdbit.stdc_bit_floor_ul + libc.src.stdbit.stdc_bit_floor_ull + libc.src.stdbit.stdc_bit_floor_us + libc.src.stdbit.stdc_bit_width_uc + libc.src.stdbit.stdc_bit_width_ui + libc.src.stdbit.stdc_bit_width_ul + libc.src.stdbit.stdc_bit_width_ull + libc.src.stdbit.stdc_bit_width_us + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_zeros_us libc.src.stdbit.stdc_first_leading_one_uc - libc.src.stdbit.stdc_first_leading_one_us libc.src.stdbit.stdc_first_leading_one_ui libc.src.stdbit.stdc_first_leading_one_ul libc.src.stdbit.stdc_first_leading_one_ull - libc.src.stdbit.stdc_first_trailing_zero_uc - libc.src.stdbit.stdc_first_trailing_zero_us - libc.src.stdbit.stdc_first_trailing_zero_ui - libc.src.stdbit.stdc_first_trailing_zero_ul - libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_zero_us libc.src.stdbit.stdc_first_trailing_one_uc - libc.src.stdbit.stdc_first_trailing_one_us libc.src.stdbit.stdc_first_trailing_one_ui 
libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull - libc.src.stdbit.stdc_count_zeros_uc - libc.src.stdbit.stdc_count_zeros_us - libc.src.stdbit.stdc_count_zeros_ui - libc.src.stdbit.stdc_count_zeros_ul - libc.src.stdbit.stdc_count_zeros_ull - libc.src.stdbit.stdc_count_ones_uc - libc.src.stdbit.stdc_count_ones_us - libc.src.stdbit.stdc_count_ones_ui - libc.src.stdbit.stdc_count_ones_ul - libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_zero_us libc.src.stdbit.stdc_has_single_bit_uc - libc.src.stdbit.stdc_has_single_bit_us libc.src.stdbit.stdc_has_single_bit_ui libc.src.stdbit.stdc_has_single_bit_ul libc.src.stdbit.stdc_has_single_bit_ull - libc.src.stdbit.stdc_bit_width_uc - libc.src.stdbit.stdc_bit_width_us - libc.src.stdbit.stdc_bit_width_ui - libc.src.stdbit.stdc_bit_width_ul - libc.src.stdbit.stdc_bit_width_ull - libc.src.stdbit.stdc_bit_floor_uc - libc.src.stdbit.stdc_bit_floor_us - libc.src.stdbit.stdc_bit_floor_ui - libc.src.stdbit.stdc_bit_floor_ul - libc.src.stdbit.stdc_bit_floor_ull - libc.src.stdbit.stdc_bit_ceil_uc - libc.src.stdbit.stdc_bit_ceil_us - libc.src.stdbit.stdc_bit_ceil_ui - libc.src.stdbit.stdc_bit_ceil_ul - libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_zeros_us # stdlib.h entrypoints libc.src.stdlib.abs - libc.src.stdlib.atoi libc.src.stdlib.atof + libc.src.stdlib.atoi libc.src.stdlib.atol libc.src.stdlib.atoll libc.src.stdlib.bsearch @@ -165,6 +161,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.strtoul libc.src.stdlib.strtoull + # stdlib.h external entrypoints + libc.src.stdlib.aligned_alloc + libc.src.stdlib.free + libc.src.stdlib.malloc + # sys/mman.h entrypoints libc.src.sys.mman.mmap libc.src.sys.mman.munmap @@ -180,6 +181,14 @@ set(TARGET_LIBC_ENTRYPOINTS ) +if(LLVM_LIBC_FULL_BUILD) + list(APPEND TARGET_LIBC_ENTRYPOINTS + # setjmp.h entrypoints + libc.src.setjmp.longjmp + libc.src.setjmp.setjmp + ) +endif() + set(TARGET_LIBM_ENTRYPOINTS # fenv.h entrypoints libc.src.fenv.feclearexcept @@ -190,11 +199,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fegetexceptflag libc.src.fenv.fegetround libc.src.fenv.feholdexcept + libc.src.fenv.feraiseexcept libc.src.fenv.fesetenv libc.src.fenv.fesetexcept libc.src.fenv.fesetexceptflag libc.src.fenv.fesetround - libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv @@ -213,6 +222,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.copysign libc.src.math.copysignf 
libc.src.math.copysignl + libc.src.math.cos libc.src.math.cosf libc.src.math.coshf libc.src.math.erff @@ -237,37 +247,37 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmaf libc.src.math.fmax libc.src.math.fmaxf - libc.src.math.fmaxl - libc.src.math.fmin - libc.src.math.fminf - libc.src.math.fminl libc.src.math.fmaximum - libc.src.math.fmaximumf - libc.src.math.fmaximuml - libc.src.math.fmaximum_num - libc.src.math.fmaximum_numf - libc.src.math.fmaximum_numl libc.src.math.fmaximum_mag - libc.src.math.fmaximum_magf - libc.src.math.fmaximum_magl libc.src.math.fmaximum_mag_num libc.src.math.fmaximum_mag_numf libc.src.math.fmaximum_mag_numl + libc.src.math.fmaximum_magf + libc.src.math.fmaximum_magl + libc.src.math.fmaximum_num + libc.src.math.fmaximum_numf + libc.src.math.fmaximum_numl + libc.src.math.fmaximumf + libc.src.math.fmaximuml + libc.src.math.fmaxl + libc.src.math.fmin + libc.src.math.fminf libc.src.math.fminimum - libc.src.math.fminimumf - libc.src.math.fminimuml - libc.src.math.fminimum_num - libc.src.math.fminimum_numf - libc.src.math.fminimum_numl libc.src.math.fminimum_mag - libc.src.math.fminimum_magf - libc.src.math.fminimum_magl libc.src.math.fminimum_mag_num libc.src.math.fminimum_mag_numf libc.src.math.fminimum_mag_numl - libc.src.math.fmul + libc.src.math.fminimum_magf + libc.src.math.fminimum_magl + libc.src.math.fminimum_num + libc.src.math.fminimum_numf + libc.src.math.fminimum_numl + libc.src.math.fminimumf + libc.src.math.fminimuml + libc.src.math.fminl libc.src.math.fmod libc.src.math.fmodf + libc.src.math.fmul libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl @@ -348,6 +358,8 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl + libc.src.math.sin + libc.src.math.sincos libc.src.math.sincosf libc.src.math.sinf libc.src.math.sinhf diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index e12d6b3957e518..ee8b3d531637e6 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -93,81 +93,81 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.inttypes.strtoumax # stdbit.h entrypoints - libc.src.stdbit.stdc_leading_zeros_uc - libc.src.stdbit.stdc_leading_zeros_us - libc.src.stdbit.stdc_leading_zeros_ui - libc.src.stdbit.stdc_leading_zeros_ul - libc.src.stdbit.stdc_leading_zeros_ull - libc.src.stdbit.stdc_leading_ones_uc - libc.src.stdbit.stdc_leading_ones_us - libc.src.stdbit.stdc_leading_ones_ui - libc.src.stdbit.stdc_leading_ones_ul - libc.src.stdbit.stdc_leading_ones_ull - libc.src.stdbit.stdc_trailing_zeros_uc - libc.src.stdbit.stdc_trailing_zeros_us - libc.src.stdbit.stdc_trailing_zeros_ui - libc.src.stdbit.stdc_trailing_zeros_ul - libc.src.stdbit.stdc_trailing_zeros_ull - libc.src.stdbit.stdc_trailing_ones_uc - libc.src.stdbit.stdc_trailing_ones_us - libc.src.stdbit.stdc_trailing_ones_ui - libc.src.stdbit.stdc_trailing_ones_ul - libc.src.stdbit.stdc_trailing_ones_ull - libc.src.stdbit.stdc_first_leading_zero_uc - libc.src.stdbit.stdc_first_leading_zero_us - libc.src.stdbit.stdc_first_leading_zero_ui - libc.src.stdbit.stdc_first_leading_zero_ul - libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_bit_ceil_uc + libc.src.stdbit.stdc_bit_ceil_ui + libc.src.stdbit.stdc_bit_ceil_ul + libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_bit_ceil_us + libc.src.stdbit.stdc_bit_floor_uc + libc.src.stdbit.stdc_bit_floor_ui + libc.src.stdbit.stdc_bit_floor_ul + libc.src.stdbit.stdc_bit_floor_ull + libc.src.stdbit.stdc_bit_floor_us 
+ libc.src.stdbit.stdc_bit_width_uc + libc.src.stdbit.stdc_bit_width_ui + libc.src.stdbit.stdc_bit_width_ul + libc.src.stdbit.stdc_bit_width_ull + libc.src.stdbit.stdc_bit_width_us + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_zeros_us libc.src.stdbit.stdc_first_leading_one_uc - libc.src.stdbit.stdc_first_leading_one_us libc.src.stdbit.stdc_first_leading_one_ui libc.src.stdbit.stdc_first_leading_one_ul libc.src.stdbit.stdc_first_leading_one_ull - libc.src.stdbit.stdc_first_trailing_zero_uc - libc.src.stdbit.stdc_first_trailing_zero_us - libc.src.stdbit.stdc_first_trailing_zero_ui - libc.src.stdbit.stdc_first_trailing_zero_ul - libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_zero_us libc.src.stdbit.stdc_first_trailing_one_uc - libc.src.stdbit.stdc_first_trailing_one_us libc.src.stdbit.stdc_first_trailing_one_ui libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull - libc.src.stdbit.stdc_count_zeros_uc - libc.src.stdbit.stdc_count_zeros_us - libc.src.stdbit.stdc_count_zeros_ui - libc.src.stdbit.stdc_count_zeros_ul - libc.src.stdbit.stdc_count_zeros_ull - libc.src.stdbit.stdc_count_ones_uc - libc.src.stdbit.stdc_count_ones_us - libc.src.stdbit.stdc_count_ones_ui - libc.src.stdbit.stdc_count_ones_ul - libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_zero_us libc.src.stdbit.stdc_has_single_bit_uc - libc.src.stdbit.stdc_has_single_bit_us libc.src.stdbit.stdc_has_single_bit_ui libc.src.stdbit.stdc_has_single_bit_ul libc.src.stdbit.stdc_has_single_bit_ull - libc.src.stdbit.stdc_bit_width_uc - libc.src.stdbit.stdc_bit_width_us - libc.src.stdbit.stdc_bit_width_ui - libc.src.stdbit.stdc_bit_width_ul - libc.src.stdbit.stdc_bit_width_ull - libc.src.stdbit.stdc_bit_floor_uc - libc.src.stdbit.stdc_bit_floor_us - libc.src.stdbit.stdc_bit_floor_ui - libc.src.stdbit.stdc_bit_floor_ul - libc.src.stdbit.stdc_bit_floor_ull - libc.src.stdbit.stdc_bit_ceil_uc - libc.src.stdbit.stdc_bit_ceil_us - libc.src.stdbit.stdc_bit_ceil_ui - libc.src.stdbit.stdc_bit_ceil_ul - libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_trailing_ones_us + 
libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_zeros_us # stdlib.h entrypoints libc.src.stdlib.abs - libc.src.stdlib.atoi libc.src.stdlib.atof + libc.src.stdlib.atoi libc.src.stdlib.atol libc.src.stdlib.atoll libc.src.stdlib.bsearch @@ -189,41 +189,41 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.strtoull # stdlib.h external entrypoints - libc.src.stdlib.malloc - libc.src.stdlib.calloc - libc.src.stdlib.realloc libc.src.stdlib.aligned_alloc + libc.src.stdlib.calloc libc.src.stdlib.free + libc.src.stdlib.malloc + libc.src.stdlib.realloc # stdio.h entrypoints + libc.src.stdio.fdopen + libc.src.stdio.fprintf + libc.src.stdio.fscanf + libc.src.stdio.printf libc.src.stdio.remove libc.src.stdio.rename - libc.src.stdio.sprintf + libc.src.stdio.scanf libc.src.stdio.snprintf - libc.src.stdio.fprintf - libc.src.stdio.printf - libc.src.stdio.vsprintf - libc.src.stdio.vsnprintf + libc.src.stdio.sprintf + libc.src.stdio.sscanf libc.src.stdio.vfprintf libc.src.stdio.vprintf - libc.src.stdio.sscanf - libc.src.stdio.scanf - libc.src.stdio.fscanf - libc.src.stdio.fdopen + libc.src.stdio.vsnprintf + libc.src.stdio.vsprintf # sys/mman.h entrypoints libc.src.sys.mman.madvise - libc.src.sys.mman.mmap - libc.src.sys.mman.mprotect - libc.src.sys.mman.munmap - libc.src.sys.mman.posix_madvise libc.src.sys.mman.mincore libc.src.sys.mman.mlock libc.src.sys.mman.mlock2 - libc.src.sys.mman.munlock libc.src.sys.mman.mlockall - libc.src.sys.mman.munlockall + libc.src.sys.mman.mmap + libc.src.sys.mman.mprotect libc.src.sys.mman.msync + libc.src.sys.mman.munlock + libc.src.sys.mman.munlockall + libc.src.sys.mman.munmap + libc.src.sys.mman.posix_madvise libc.src.sys.mman.shm_open libc.src.sys.mman.shm_unlink @@ -272,11 +272,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.termios.cfgetospeed libc.src.termios.cfsetispeed libc.src.termios.cfsetospeed - libc.src.termios.tcgetattr - libc.src.termios.tcgetsid libc.src.termios.tcdrain libc.src.termios.tcflow libc.src.termios.tcflush + libc.src.termios.tcgetattr + libc.src.termios.tcgetsid libc.src.termios.tcsendbreak libc.src.termios.tcsetattr @@ -328,11 +328,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fegetexceptflag libc.src.fenv.fegetround libc.src.fenv.feholdexcept + libc.src.fenv.feraiseexcept libc.src.fenv.fesetenv libc.src.fenv.fesetexcept libc.src.fenv.fesetexceptflag libc.src.fenv.fesetround - libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv @@ -345,21 +345,22 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf - libc.src.math.copysign - libc.src.math.copysignf - libc.src.math.copysignl libc.src.math.ceil libc.src.math.ceilf libc.src.math.ceill - libc.src.math.coshf + libc.src.math.copysign + libc.src.math.copysignf + libc.src.math.copysignl + libc.src.math.cos libc.src.math.cosf + libc.src.math.coshf libc.src.math.erff libc.src.math.exp - libc.src.math.expf libc.src.math.exp10 libc.src.math.exp10f libc.src.math.exp2 libc.src.math.exp2f + libc.src.math.expf libc.src.math.expm1 libc.src.math.expm1f libc.src.math.fabs @@ -373,40 +374,40 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.floorl libc.src.math.fma libc.src.math.fmaf - libc.src.math.fmin - libc.src.math.fminf - libc.src.math.fminl libc.src.math.fmax libc.src.math.fmaxf - libc.src.math.fmaxl libc.src.math.fmaximum - libc.src.math.fmaximumf - libc.src.math.fmaximuml - 
libc.src.math.fmaximum_num - libc.src.math.fmaximum_numf - libc.src.math.fmaximum_numl libc.src.math.fmaximum_mag - libc.src.math.fmaximum_magf - libc.src.math.fmaximum_magl libc.src.math.fmaximum_mag_num libc.src.math.fmaximum_mag_numf libc.src.math.fmaximum_mag_numl + libc.src.math.fmaximum_magf + libc.src.math.fmaximum_magl + libc.src.math.fmaximum_num + libc.src.math.fmaximum_numf + libc.src.math.fmaximum_numl + libc.src.math.fmaximumf + libc.src.math.fmaximuml + libc.src.math.fmaxl + libc.src.math.fmin + libc.src.math.fminf libc.src.math.fminimum - libc.src.math.fminimumf - libc.src.math.fminimuml - libc.src.math.fminimum_num - libc.src.math.fminimum_numf - libc.src.math.fminimum_numl libc.src.math.fminimum_mag - libc.src.math.fminimum_magf - libc.src.math.fminimum_magl libc.src.math.fminimum_mag_num libc.src.math.fminimum_mag_numf libc.src.math.fminimum_mag_numl - libc.src.math.fmul + libc.src.math.fminimum_magf + libc.src.math.fminimum_magl + libc.src.math.fminimum_num + libc.src.math.fminimum_numf + libc.src.math.fminimum_numl + libc.src.math.fminimumf + libc.src.math.fminimuml + libc.src.math.fminl libc.src.math.fmod libc.src.math.fmodf libc.src.math.fmodl + libc.src.math.fmul libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl @@ -433,17 +434,17 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.llround libc.src.math.llroundf libc.src.math.llroundl + libc.src.math.log libc.src.math.log10 libc.src.math.log10f libc.src.math.log1p libc.src.math.log1pf libc.src.math.log2 libc.src.math.log2f - libc.src.math.log - libc.src.math.logf libc.src.math.logb libc.src.math.logbf libc.src.math.logbl + libc.src.math.logf libc.src.math.lrint libc.src.math.lrintf libc.src.math.lrintl @@ -472,11 +473,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.nextupf libc.src.math.nextupl libc.src.math.powf - libc.src.math.remainderf libc.src.math.remainder + libc.src.math.remainderf libc.src.math.remainderl - libc.src.math.remquof libc.src.math.remquo + libc.src.math.remquof libc.src.math.remquol libc.src.math.rint libc.src.math.rintf @@ -487,9 +488,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl + libc.src.math.sin + libc.src.math.sincos libc.src.math.sincosf - libc.src.math.sinhf libc.src.math.sinf + libc.src.math.sinhf libc.src.math.sqrt libc.src.math.sqrtf libc.src.math.sqrtl @@ -515,15 +518,15 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 - libc.src.math.fminf128 - libc.src.math.fmaximumf128 - libc.src.math.fmaximum_numf128 - libc.src.math.fmaximum_magf128 libc.src.math.fmaximum_mag_numf128 - libc.src.math.fminimumf128 - libc.src.math.fminimum_numf128 - libc.src.math.fminimum_magf128 + libc.src.math.fmaximum_magf128 + libc.src.math.fmaximum_numf128 + libc.src.math.fmaximumf128 + libc.src.math.fminf128 libc.src.math.fminimum_mag_numf128 + libc.src.math.fminimum_magf128 + libc.src.math.fminimum_numf128 + libc.src.math.fminimumf128 libc.src.math.fmodf128 libc.src.math.frexpf128 libc.src.math.fromfpf128 @@ -531,9 +534,9 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.ilogbf128 libc.src.math.ldexpf128 libc.src.math.llogbf128 - libc.src.math.logbf128 libc.src.math.llrintf128 libc.src.math.llroundf128 + libc.src.math.logbf128 libc.src.math.lrintf128 libc.src.math.lroundf128 libc.src.math.modff128 @@ -575,11 +578,11 @@ if(LLVM_LIBC_FULL_BUILD) # pthread.h entrypoints libc.src.pthread.pthread_atfork libc.src.pthread.pthread_attr_destroy - libc.src.pthread.pthread_attr_init 
libc.src.pthread.pthread_attr_getdetachstate libc.src.pthread.pthread_attr_getguardsize libc.src.pthread.pthread_attr_getstack libc.src.pthread.pthread_attr_getstacksize + libc.src.pthread.pthread_attr_init libc.src.pthread.pthread_attr_setdetachstate libc.src.pthread.pthread_attr_setguardsize libc.src.pthread.pthread_attr_setstack @@ -593,21 +596,21 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_join libc.src.pthread.pthread_key_create libc.src.pthread.pthread_key_delete - libc.src.pthread.pthread_self - libc.src.pthread.pthread_setname_np libc.src.pthread.pthread_mutex_destroy libc.src.pthread.pthread_mutex_init libc.src.pthread.pthread_mutex_lock libc.src.pthread.pthread_mutex_unlock libc.src.pthread.pthread_mutexattr_destroy - libc.src.pthread.pthread_mutexattr_init libc.src.pthread.pthread_mutexattr_getpshared libc.src.pthread.pthread_mutexattr_getrobust libc.src.pthread.pthread_mutexattr_gettype + libc.src.pthread.pthread_mutexattr_init libc.src.pthread.pthread_mutexattr_setpshared libc.src.pthread.pthread_mutexattr_setrobust libc.src.pthread.pthread_mutexattr_settype libc.src.pthread.pthread_once + libc.src.pthread.pthread_self + libc.src.pthread.pthread_setname_np libc.src.pthread.pthread_setspecific # sched.h entrypoints @@ -621,20 +624,20 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdio.clearerr libc.src.stdio.clearerr_unlocked libc.src.stdio.fclose - libc.src.stdio.flockfile libc.src.stdio.feof libc.src.stdio.feof_unlocked libc.src.stdio.ferror libc.src.stdio.ferror_unlocked + libc.src.stdio.fflush libc.src.stdio.fgetc libc.src.stdio.fgetc_unlocked libc.src.stdio.fgets - libc.src.stdio.fflush libc.src.stdio.fileno + libc.src.stdio.flockfile libc.src.stdio.fopen + libc.src.stdio.fopencookie libc.src.stdio.fputc libc.src.stdio.fputs - libc.src.stdio.fopencookie libc.src.stdio.fread libc.src.stdio.fread_unlocked libc.src.stdio.fseek @@ -664,16 +667,16 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.getenv # signal.h entrypoints - libc.src.signal.raise libc.src.signal.kill + libc.src.signal.raise libc.src.signal.sigaction + libc.src.signal.sigaddset libc.src.signal.sigaltstack libc.src.signal.sigdelset - libc.src.signal.sigaddset libc.src.signal.sigemptyset - libc.src.signal.sigprocmask libc.src.signal.sigfillset libc.src.signal.signal + libc.src.signal.sigprocmask # spawn.h entrypoints libc.src.spawn.posix_spawn @@ -686,10 +689,10 @@ if(LLVM_LIBC_FULL_BUILD) # search.h entrypoints libc.src.search.hcreate libc.src.search.hcreate_r - libc.src.search.hsearch - libc.src.search.hsearch_r libc.src.search.hdestroy libc.src.search.hdestroy_r + libc.src.search.hsearch + libc.src.search.hsearch_r libc.src.search.insque libc.src.search.remque @@ -718,8 +721,8 @@ if(LLVM_LIBC_FULL_BUILD) # time.h entrypoints libc.src.time.asctime libc.src.time.asctime_r - libc.src.time.clock_gettime libc.src.time.clock + libc.src.time.clock_gettime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime @@ -729,16 +732,16 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.time # unistd.h entrypoints + libc.src.unistd.__llvm_libc_syscall libc.src.unistd._exit libc.src.unistd.environ libc.src.unistd.execv libc.src.unistd.fork - libc.src.unistd.__llvm_libc_syscall libc.src.unistd.getopt libc.src.unistd.optarg + libc.src.unistd.opterr libc.src.unistd.optind libc.src.unistd.optopt - libc.src.unistd.opterr libc.src.unistd.swab # sys/select.h entrypoints diff --git a/libc/config/linux/syscall_numbers.h.inc b/libc/config/linux/syscall_numbers.h.inc deleted file mode 100644 index 4a19d9a08875e8..00000000000000 --- 
a/libc/config/linux/syscall_numbers.h.inc +++ /dev/null @@ -1,2347 +0,0 @@ -//===---------------------- Linux syscall number macros -------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -%%begin() - -#include - -#ifdef __NR_FAST_atomic_update -#define SYS_FAST_atomic_update __NR_FAST_atomic_update -#endif - -#ifdef __NR_FAST_cmpxchg -#define SYS_FAST_cmpxchg __NR_FAST_cmpxchg -#endif - -#ifdef __NR_FAST_cmpxchg64 -#define SYS_FAST_cmpxchg64 __NR_FAST_cmpxchg64 -#endif - -#ifdef __NR__llseek -#define SYS__llseek __NR__llseek -#endif - -#ifdef __NR__newselect -#define SYS__newselect __NR__newselect -#endif - -#ifdef __NR__sysctl -#define SYS__sysctl __NR__sysctl -#endif - -#ifdef __NR_accept -#define SYS_accept __NR_accept -#endif - -#ifdef __NR_accept4 -#define SYS_accept4 __NR_accept4 -#endif - -#ifdef __NR_access -#define SYS_access __NR_access -#endif - -#ifdef __NR_acct -#define SYS_acct __NR_acct -#endif - -#ifdef __NR_acl_get -#define SYS_acl_get __NR_acl_get -#endif - -#ifdef __NR_acl_set -#define SYS_acl_set __NR_acl_set -#endif - -#ifdef __NR_add_key -#define SYS_add_key __NR_add_key -#endif - -#ifdef __NR_adjtimex -#define SYS_adjtimex __NR_adjtimex -#endif - -#ifdef __NR_afs_syscall -#define SYS_afs_syscall __NR_afs_syscall -#endif - -#ifdef __NR_alarm -#define SYS_alarm __NR_alarm -#endif - -#ifdef __NR_alloc_hugepages -#define SYS_alloc_hugepages __NR_alloc_hugepages -#endif - -#ifdef __NR_arch_prctl -#define SYS_arch_prctl __NR_arch_prctl -#endif - -#ifdef __NR_arm_fadvise64_64 -#define SYS_arm_fadvise64_64 __NR_arm_fadvise64_64 -#endif - -#ifdef __NR_arm_sync_file_range -#define SYS_arm_sync_file_range __NR_arm_sync_file_range -#endif - -#ifdef __NR_atomic_barrier -#define SYS_atomic_barrier __NR_atomic_barrier -#endif - -#ifdef __NR_atomic_cmpxchg_32 -#define SYS_atomic_cmpxchg_32 __NR_atomic_cmpxchg_32 -#endif - -#ifdef __NR_attrctl -#define SYS_attrctl __NR_attrctl -#endif - -#ifdef __NR_bdflush -#define SYS_bdflush __NR_bdflush -#endif - -#ifdef __NR_bind -#define SYS_bind __NR_bind -#endif - -#ifdef __NR_bpf -#define SYS_bpf __NR_bpf -#endif - -#ifdef __NR_break -#define SYS_break __NR_break -#endif - -#ifdef __NR_brk -#define SYS_brk __NR_brk -#endif - -#ifdef __NR_cachectl -#define SYS_cachectl __NR_cachectl -#endif - -#ifdef __NR_cacheflush -#define SYS_cacheflush __NR_cacheflush -#endif - -#ifdef __NR_capget -#define SYS_capget __NR_capget -#endif - -#ifdef __NR_capset -#define SYS_capset __NR_capset -#endif - -#ifdef __NR_chdir -#define SYS_chdir __NR_chdir -#endif - -#ifdef __NR_chmod -#define SYS_chmod __NR_chmod -#endif - -#ifdef __NR_chown -#define SYS_chown __NR_chown -#endif - -#ifdef __NR_chown32 -#define SYS_chown32 __NR_chown32 -#endif - -#ifdef __NR_chroot -#define SYS_chroot __NR_chroot -#endif - -#ifdef __NR_clock_adjtime -#define SYS_clock_adjtime __NR_clock_adjtime -#endif - -#ifdef __NR_clock_getres -#define SYS_clock_getres __NR_clock_getres -#endif - -#ifdef __NR_clock_gettime -#define SYS_clock_gettime __NR_clock_gettime -#endif - -#ifdef __NR_clock_gettime64 -#define SYS_clock_gettime64 __NR_clock_gettime64 -#endif - -#ifdef __NR_clock_nanosleep -#define SYS_clock_nanosleep __NR_clock_nanosleep -#endif - -#ifdef __NR_clock_nanosleep_time64 -#define SYS_clock_nanosleep_time64 
__NR_clock_nanosleep_time64 -#endif - -#ifdef __NR_clock_settime -#define SYS_clock_settime __NR_clock_settime -#endif - -#ifdef __NR_clone -#define SYS_clone __NR_clone -#endif - -#ifdef __NR_clone2 -#define SYS_clone2 __NR_clone2 -#endif - -#ifdef __NR_close -#define SYS_close __NR_close -#endif - -#ifdef __NR_cmpxchg_badaddr -#define SYS_cmpxchg_badaddr __NR_cmpxchg_badaddr -#endif - -#ifdef __NR_connect -#define SYS_connect __NR_connect -#endif - -#ifdef __NR_copy_file_range -#define SYS_copy_file_range __NR_copy_file_range -#endif - -#ifdef __NR_creat -#define SYS_creat __NR_creat -#endif - -#ifdef __NR_create_module -#define SYS_create_module __NR_create_module -#endif - -#ifdef __NR_delete_module -#define SYS_delete_module __NR_delete_module -#endif - -#ifdef __NR_dipc -#define SYS_dipc __NR_dipc -#endif - -#ifdef __NR_dup -#define SYS_dup __NR_dup -#endif - -#ifdef __NR_dup2 -#define SYS_dup2 __NR_dup2 -#endif - -#ifdef __NR_dup3 -#define SYS_dup3 __NR_dup3 -#endif - -#ifdef __NR_epoll_create -#define SYS_epoll_create __NR_epoll_create -#endif - -#ifdef __NR_epoll_create1 -#define SYS_epoll_create1 __NR_epoll_create1 -#endif - -#ifdef __NR_epoll_ctl -#define SYS_epoll_ctl __NR_epoll_ctl -#endif - -#ifdef __NR_epoll_ctl_old -#define SYS_epoll_ctl_old __NR_epoll_ctl_old -#endif - -#ifdef __NR_epoll_pwait -#define SYS_epoll_pwait __NR_epoll_pwait -#endif - -#ifdef __NR_epoll_pwait2 -#define SYS_epoll_pwait2 __NR_epoll_pwait2 -#endif - -#ifdef __NR_epoll_wait -#define SYS_epoll_wait __NR_epoll_wait -#endif - -#ifdef __NR_epoll_wait_old -#define SYS_epoll_wait_old __NR_epoll_wait_old -#endif - -#ifdef __NR_eventfd -#define SYS_eventfd __NR_eventfd -#endif - -#ifdef __NR_eventfd2 -#define SYS_eventfd2 __NR_eventfd2 -#endif - -#ifdef __NR_exec_with_loader -#define SYS_exec_with_loader __NR_exec_with_loader -#endif - -#ifdef __NR_execv -#define SYS_execv __NR_execv -#endif - -#ifdef __NR_execve -#define SYS_execve __NR_execve -#endif - -#ifdef __NR_execveat -#define SYS_execveat __NR_execveat -#endif - -#ifdef __NR_exit -#define SYS_exit __NR_exit -#endif - -#ifdef __NR_exit_group -#define SYS_exit_group __NR_exit_group -#endif - -#ifdef __NR_faccessat -#define SYS_faccessat __NR_faccessat -#endif - -#ifdef __NR_fadvise64 -#define SYS_fadvise64 __NR_fadvise64 -#endif - -#ifdef __NR_fadvise64_64 -#define SYS_fadvise64_64 __NR_fadvise64_64 -#endif - -#ifdef __NR_fallocate -#define SYS_fallocate __NR_fallocate -#endif - -#ifdef __NR_fanotify_init -#define SYS_fanotify_init __NR_fanotify_init -#endif - -#ifdef __NR_fanotify_mark -#define SYS_fanotify_mark __NR_fanotify_mark -#endif - -#ifdef __NR_fchdir -#define SYS_fchdir __NR_fchdir -#endif - -#ifdef __NR_fchmod -#define SYS_fchmod __NR_fchmod -#endif - -#ifdef __NR_fchmodat -#define SYS_fchmodat __NR_fchmodat -#endif - -#ifdef __NR_fchmodat2 -#define SYS_fchmodat2 __NR_fchmodat2 -#endif - -#ifdef __NR_fchown -#define SYS_fchown __NR_fchown -#endif - -#ifdef __NR_fchown32 -#define SYS_fchown32 __NR_fchown32 -#endif - -#ifdef __NR_fchownat -#define SYS_fchownat __NR_fchownat -#endif - -#ifdef __NR_fcntl -#define SYS_fcntl __NR_fcntl -#endif - -#ifdef __NR_fcntl64 -#define SYS_fcntl64 __NR_fcntl64 -#endif - -#ifdef __NR_fdatasync -#define SYS_fdatasync __NR_fdatasync -#endif - -#ifdef __NR_fgetxattr -#define SYS_fgetxattr __NR_fgetxattr -#endif - -#ifdef __NR_finit_module -#define SYS_finit_module __NR_finit_module -#endif - -#ifdef __NR_flistxattr -#define SYS_flistxattr __NR_flistxattr -#endif - -#ifdef __NR_flock -#define SYS_flock 
__NR_flock -#endif - -#ifdef __NR_fork -#define SYS_fork __NR_fork -#endif - -#ifdef __NR_free_hugepages -#define SYS_free_hugepages __NR_free_hugepages -#endif - -#ifdef __NR_fremovexattr -#define SYS_fremovexattr __NR_fremovexattr -#endif - -#ifdef __NR_fsetxattr -#define SYS_fsetxattr __NR_fsetxattr -#endif - -#ifdef __NR_fstat -#define SYS_fstat __NR_fstat -#endif - -#ifdef __NR_fstat64 -#define SYS_fstat64 __NR_fstat64 -#endif - -#ifdef __NR_fstatat64 -#define SYS_fstatat64 __NR_fstatat64 -#endif - -#ifdef __NR_fstatfs -#define SYS_fstatfs __NR_fstatfs -#endif - -#ifdef __NR_fstatfs64 -#define SYS_fstatfs64 __NR_fstatfs64 -#endif - -#ifdef __NR_fsync -#define SYS_fsync __NR_fsync -#endif - -#ifdef __NR_ftime -#define SYS_ftime __NR_ftime -#endif - -#ifdef __NR_ftruncate -#define SYS_ftruncate __NR_ftruncate -#endif - -#ifdef __NR_ftruncate64 -#define SYS_ftruncate64 __NR_ftruncate64 -#endif - -#ifdef __NR_futex -#define SYS_futex __NR_futex -#endif - -#ifdef __NR_futex_time64 -#define SYS_futex_time64 __NR_futex_time64 -#endif - -#ifdef __NR_futimesat -#define SYS_futimesat __NR_futimesat -#endif - -#ifdef __NR_get_kernel_syms -#define SYS_get_kernel_syms __NR_get_kernel_syms -#endif - -#ifdef __NR_get_mempolicy -#define SYS_get_mempolicy __NR_get_mempolicy -#endif - -#ifdef __NR_get_robust_list -#define SYS_get_robust_list __NR_get_robust_list -#endif - -#ifdef __NR_get_thread_area -#define SYS_get_thread_area __NR_get_thread_area -#endif - -#ifdef __NR_getcpu -#define SYS_getcpu __NR_getcpu -#endif - -#ifdef __NR_getcwd -#define SYS_getcwd __NR_getcwd -#endif - -#ifdef __NR_getdents -#define SYS_getdents __NR_getdents -#endif - -#ifdef __NR_getdents64 -#define SYS_getdents64 __NR_getdents64 -#endif - -#ifdef __NR_getdomainname -#define SYS_getdomainname __NR_getdomainname -#endif - -#ifdef __NR_getdtablesize -#define SYS_getdtablesize __NR_getdtablesize -#endif - -#ifdef __NR_getegid -#define SYS_getegid __NR_getegid -#endif - -#ifdef __NR_getegid32 -#define SYS_getegid32 __NR_getegid32 -#endif - -#ifdef __NR_geteuid -#define SYS_geteuid __NR_geteuid -#endif - -#ifdef __NR_geteuid32 -#define SYS_geteuid32 __NR_geteuid32 -#endif - -#ifdef __NR_getgid -#define SYS_getgid __NR_getgid -#endif - -#ifdef __NR_getgid32 -#define SYS_getgid32 __NR_getgid32 -#endif - -#ifdef __NR_getgroups -#define SYS_getgroups __NR_getgroups -#endif - -#ifdef __NR_getgroups32 -#define SYS_getgroups32 __NR_getgroups32 -#endif - -#ifdef __NR_gethostname -#define SYS_gethostname __NR_gethostname -#endif - -#ifdef __NR_getitimer -#define SYS_getitimer __NR_getitimer -#endif - -#ifdef __NR_getpagesize -#define SYS_getpagesize __NR_getpagesize -#endif - -#ifdef __NR_getpeername -#define SYS_getpeername __NR_getpeername -#endif - -#ifdef __NR_getpgid -#define SYS_getpgid __NR_getpgid -#endif - -#ifdef __NR_getpgrp -#define SYS_getpgrp __NR_getpgrp -#endif - -#ifdef __NR_getpid -#define SYS_getpid __NR_getpid -#endif - -#ifdef __NR_getpmsg -#define SYS_getpmsg __NR_getpmsg -#endif - -#ifdef __NR_getppid -#define SYS_getppid __NR_getppid -#endif - -#ifdef __NR_getpriority -#define SYS_getpriority __NR_getpriority -#endif - -#ifdef __NR_getrandom -#define SYS_getrandom __NR_getrandom -#endif - -#ifdef __NR_getresgid -#define SYS_getresgid __NR_getresgid -#endif - -#ifdef __NR_getresgid32 -#define SYS_getresgid32 __NR_getresgid32 -#endif - -#ifdef __NR_getresuid -#define SYS_getresuid __NR_getresuid -#endif - -#ifdef __NR_getresuid32 -#define SYS_getresuid32 __NR_getresuid32 -#endif - -#ifdef __NR_getrlimit -#define 
SYS_getrlimit __NR_getrlimit -#endif - -#ifdef __NR_getrusage -#define SYS_getrusage __NR_getrusage -#endif - -#ifdef __NR_getsid -#define SYS_getsid __NR_getsid -#endif - -#ifdef __NR_getsockname -#define SYS_getsockname __NR_getsockname -#endif - -#ifdef __NR_getsockopt -#define SYS_getsockopt __NR_getsockopt -#endif - -#ifdef __NR_gettid -#define SYS_gettid __NR_gettid -#endif - -#ifdef __NR_gettimeofday -#define SYS_gettimeofday __NR_gettimeofday -#endif - -#ifdef __NR_getuid -#define SYS_getuid __NR_getuid -#endif - -#ifdef __NR_getuid32 -#define SYS_getuid32 __NR_getuid32 -#endif - -#ifdef __NR_getunwind -#define SYS_getunwind __NR_getunwind -#endif - -#ifdef __NR_getxattr -#define SYS_getxattr __NR_getxattr -#endif - -#ifdef __NR_getxgid -#define SYS_getxgid __NR_getxgid -#endif - -#ifdef __NR_getxpid -#define SYS_getxpid __NR_getxpid -#endif - -#ifdef __NR_getxuid -#define SYS_getxuid __NR_getxuid -#endif - -#ifdef __NR_gtty -#define SYS_gtty __NR_gtty -#endif - -#ifdef __NR_idle -#define SYS_idle __NR_idle -#endif - -#ifdef __NR_init_module -#define SYS_init_module __NR_init_module -#endif - -#ifdef __NR_inotify_add_watch -#define SYS_inotify_add_watch __NR_inotify_add_watch -#endif - -#ifdef __NR_inotify_init -#define SYS_inotify_init __NR_inotify_init -#endif - -#ifdef __NR_inotify_init1 -#define SYS_inotify_init1 __NR_inotify_init1 -#endif - -#ifdef __NR_inotify_rm_watch -#define SYS_inotify_rm_watch __NR_inotify_rm_watch -#endif - -#ifdef __NR_io_cancel -#define SYS_io_cancel __NR_io_cancel -#endif - -#ifdef __NR_io_destroy -#define SYS_io_destroy __NR_io_destroy -#endif - -#ifdef __NR_io_getevents -#define SYS_io_getevents __NR_io_getevents -#endif - -#ifdef __NR_io_pgetevents -#define SYS_io_pgetevents __NR_io_pgetevents -#endif - -#ifdef __NR_io_setup -#define SYS_io_setup __NR_io_setup -#endif - -#ifdef __NR_io_submit -#define SYS_io_submit __NR_io_submit -#endif - -#ifdef __NR_ioctl -#define SYS_ioctl __NR_ioctl -#endif - -#ifdef __NR_ioperm -#define SYS_ioperm __NR_ioperm -#endif - -#ifdef __NR_iopl -#define SYS_iopl __NR_iopl -#endif - -#ifdef __NR_ioprio_get -#define SYS_ioprio_get __NR_ioprio_get -#endif - -#ifdef __NR_ioprio_set -#define SYS_ioprio_set __NR_ioprio_set -#endif - -#ifdef __NR_ipc -#define SYS_ipc __NR_ipc -#endif - -#ifdef __NR_kcmp -#define SYS_kcmp __NR_kcmp -#endif - -#ifdef __NR_kern_features -#define SYS_kern_features __NR_kern_features -#endif - -#ifdef __NR_kexec_file_load -#define SYS_kexec_file_load __NR_kexec_file_load -#endif - -#ifdef __NR_kexec_load -#define SYS_kexec_load __NR_kexec_load -#endif - -#ifdef __NR_keyctl -#define SYS_keyctl __NR_keyctl -#endif - -#ifdef __NR_kill -#define SYS_kill __NR_kill -#endif - -#ifdef __NR_lchown -#define SYS_lchown __NR_lchown -#endif - -#ifdef __NR_lchown32 -#define SYS_lchown32 __NR_lchown32 -#endif - -#ifdef __NR_lgetxattr -#define SYS_lgetxattr __NR_lgetxattr -#endif - -#ifdef __NR_link -#define SYS_link __NR_link -#endif - -#ifdef __NR_linkat -#define SYS_linkat __NR_linkat -#endif - -#ifdef __NR_listen -#define SYS_listen __NR_listen -#endif - -#ifdef __NR_listxattr -#define SYS_listxattr __NR_listxattr -#endif - -#ifdef __NR_llistxattr -#define SYS_llistxattr __NR_llistxattr -#endif - -#ifdef __NR_llseek -#define SYS_llseek __NR_llseek -#endif - -#ifdef __NR_lock -#define SYS_lock __NR_lock -#endif - -#ifdef __NR_lookup_dcookie -#define SYS_lookup_dcookie __NR_lookup_dcookie -#endif - -#ifdef __NR_lremovexattr -#define SYS_lremovexattr __NR_lremovexattr -#endif - -#ifdef __NR_lseek -#define 
SYS_lseek __NR_lseek -#endif - -#ifdef __NR_lsetxattr -#define SYS_lsetxattr __NR_lsetxattr -#endif - -#ifdef __NR_lstat -#define SYS_lstat __NR_lstat -#endif - -#ifdef __NR_lstat64 -#define SYS_lstat64 __NR_lstat64 -#endif - -#ifdef __NR_madvise -#define SYS_madvise __NR_madvise -#endif - -#ifdef __NR_mbind -#define SYS_mbind __NR_mbind -#endif - -#ifdef __NR_membarrier -#define SYS_membarrier __NR_membarrier -#endif - -#ifdef __NR_memfd_create -#define SYS_memfd_create __NR_memfd_create -#endif - -#ifdef __NR_memory_ordering -#define SYS_memory_ordering __NR_memory_ordering -#endif - -#ifdef __NR_migrate_pages -#define SYS_migrate_pages __NR_migrate_pages -#endif - -#ifdef __NR_mincore -#define SYS_mincore __NR_mincore -#endif - -#ifdef __NR_mkdir -#define SYS_mkdir __NR_mkdir -#endif - -#ifdef __NR_mkdirat -#define SYS_mkdirat __NR_mkdirat -#endif - -#ifdef __NR_mknod -#define SYS_mknod __NR_mknod -#endif - -#ifdef __NR_mknodat -#define SYS_mknodat __NR_mknodat -#endif - -#ifdef __NR_mlock -#define SYS_mlock __NR_mlock -#endif - -#ifdef __NR_mlock2 -#define SYS_mlock2 __NR_mlock2 -#endif - -#ifdef __NR_mlockall -#define SYS_mlockall __NR_mlockall -#endif - -#ifdef __NR_mmap -#define SYS_mmap __NR_mmap -#endif - -#ifdef __NR_mmap2 -#define SYS_mmap2 __NR_mmap2 -#endif - -#ifdef __NR_modify_ldt -#define SYS_modify_ldt __NR_modify_ldt -#endif - -#ifdef __NR_mount -#define SYS_mount __NR_mount -#endif - -#ifdef __NR_move_pages -#define SYS_move_pages __NR_move_pages -#endif - -#ifdef __NR_mprotect -#define SYS_mprotect __NR_mprotect -#endif - -#ifdef __NR_mpx -#define SYS_mpx __NR_mpx -#endif - -#ifdef __NR_mq_getsetattr -#define SYS_mq_getsetattr __NR_mq_getsetattr -#endif - -#ifdef __NR_mq_notify -#define SYS_mq_notify __NR_mq_notify -#endif - -#ifdef __NR_mq_open -#define SYS_mq_open __NR_mq_open -#endif - -#ifdef __NR_mq_timedreceive -#define SYS_mq_timedreceive __NR_mq_timedreceive -#endif - -#ifdef __NR_mq_timedsend -#define SYS_mq_timedsend __NR_mq_timedsend -#endif - -#ifdef __NR_mq_unlink -#define SYS_mq_unlink __NR_mq_unlink -#endif - -#ifdef __NR_mremap -#define SYS_mremap __NR_mremap -#endif - -#ifdef __NR_msgctl -#define SYS_msgctl __NR_msgctl -#endif - -#ifdef __NR_msgget -#define SYS_msgget __NR_msgget -#endif - -#ifdef __NR_msgrcv -#define SYS_msgrcv __NR_msgrcv -#endif - -#ifdef __NR_msgsnd -#define SYS_msgsnd __NR_msgsnd -#endif - -#ifdef __NR_msync -#define SYS_msync __NR_msync -#endif - -#ifdef __NR_multiplexer -#define SYS_multiplexer __NR_multiplexer -#endif - -#ifdef __NR_munlock -#define SYS_munlock __NR_munlock -#endif - -#ifdef __NR_munlockall -#define SYS_munlockall __NR_munlockall -#endif - -#ifdef __NR_munmap -#define SYS_munmap __NR_munmap -#endif - -#ifdef __NR_name_to_handle_at -#define SYS_name_to_handle_at __NR_name_to_handle_at -#endif - -#ifdef __NR_nanosleep -#define SYS_nanosleep __NR_nanosleep -#endif - -#ifdef __NR_newfstatat -#define SYS_newfstatat __NR_newfstatat -#endif - -#ifdef __NR_nfsservctl -#define SYS_nfsservctl __NR_nfsservctl -#endif - -#ifdef __NR_ni_syscall -#define SYS_ni_syscall __NR_ni_syscall -#endif - -#ifdef __NR_nice -#define SYS_nice __NR_nice -#endif - -#ifdef __NR_old_adjtimex -#define SYS_old_adjtimex __NR_old_adjtimex -#endif - -#ifdef __NR_oldfstat -#define SYS_oldfstat __NR_oldfstat -#endif - -#ifdef __NR_oldlstat -#define SYS_oldlstat __NR_oldlstat -#endif - -#ifdef __NR_oldolduname -#define SYS_oldolduname __NR_oldolduname -#endif - -#ifdef __NR_oldstat -#define SYS_oldstat __NR_oldstat -#endif - -#ifdef __NR_oldumount 
-#define SYS_oldumount __NR_oldumount -#endif - -#ifdef __NR_olduname -#define SYS_olduname __NR_olduname -#endif - -#ifdef __NR_open -#define SYS_open __NR_open -#endif - -#ifdef __NR_open_by_handle_at -#define SYS_open_by_handle_at __NR_open_by_handle_at -#endif - -#ifdef __NR_openat -#define SYS_openat __NR_openat -#endif - -#ifdef __NR_osf_adjtime -#define SYS_osf_adjtime __NR_osf_adjtime -#endif - -#ifdef __NR_osf_afs_syscall -#define SYS_osf_afs_syscall __NR_osf_afs_syscall -#endif - -#ifdef __NR_osf_alt_plock -#define SYS_osf_alt_plock __NR_osf_alt_plock -#endif - -#ifdef __NR_osf_alt_setsid -#define SYS_osf_alt_setsid __NR_osf_alt_setsid -#endif - -#ifdef __NR_osf_alt_sigpending -#define SYS_osf_alt_sigpending __NR_osf_alt_sigpending -#endif - -#ifdef __NR_osf_asynch_daemon -#define SYS_osf_asynch_daemon __NR_osf_asynch_daemon -#endif - -#ifdef __NR_osf_audcntl -#define SYS_osf_audcntl __NR_osf_audcntl -#endif - -#ifdef __NR_osf_audgen -#define SYS_osf_audgen __NR_osf_audgen -#endif - -#ifdef __NR_osf_chflags -#define SYS_osf_chflags __NR_osf_chflags -#endif - -#ifdef __NR_osf_execve -#define SYS_osf_execve __NR_osf_execve -#endif - -#ifdef __NR_osf_exportfs -#define SYS_osf_exportfs __NR_osf_exportfs -#endif - -#ifdef __NR_osf_fchflags -#define SYS_osf_fchflags __NR_osf_fchflags -#endif - -#ifdef __NR_osf_fdatasync -#define SYS_osf_fdatasync __NR_osf_fdatasync -#endif - -#ifdef __NR_osf_fpathconf -#define SYS_osf_fpathconf __NR_osf_fpathconf -#endif - -#ifdef __NR_osf_fstat -#define SYS_osf_fstat __NR_osf_fstat -#endif - -#ifdef __NR_osf_fstatfs -#define SYS_osf_fstatfs __NR_osf_fstatfs -#endif - -#ifdef __NR_osf_fstatfs64 -#define SYS_osf_fstatfs64 __NR_osf_fstatfs64 -#endif - -#ifdef __NR_osf_fuser -#define SYS_osf_fuser __NR_osf_fuser -#endif - -#ifdef __NR_osf_getaddressconf -#define SYS_osf_getaddressconf __NR_osf_getaddressconf -#endif - -#ifdef __NR_osf_getdirentries -#define SYS_osf_getdirentries __NR_osf_getdirentries -#endif - -#ifdef __NR_osf_getdomainname -#define SYS_osf_getdomainname __NR_osf_getdomainname -#endif - -#ifdef __NR_osf_getfh -#define SYS_osf_getfh __NR_osf_getfh -#endif - -#ifdef __NR_osf_getfsstat -#define SYS_osf_getfsstat __NR_osf_getfsstat -#endif - -#ifdef __NR_osf_gethostid -#define SYS_osf_gethostid __NR_osf_gethostid -#endif - -#ifdef __NR_osf_getitimer -#define SYS_osf_getitimer __NR_osf_getitimer -#endif - -#ifdef __NR_osf_getlogin -#define SYS_osf_getlogin __NR_osf_getlogin -#endif - -#ifdef __NR_osf_getmnt -#define SYS_osf_getmnt __NR_osf_getmnt -#endif - -#ifdef __NR_osf_getrusage -#define SYS_osf_getrusage __NR_osf_getrusage -#endif - -#ifdef __NR_osf_getsysinfo -#define SYS_osf_getsysinfo __NR_osf_getsysinfo -#endif - -#ifdef __NR_osf_gettimeofday -#define SYS_osf_gettimeofday __NR_osf_gettimeofday -#endif - -#ifdef __NR_osf_kloadcall -#define SYS_osf_kloadcall __NR_osf_kloadcall -#endif - -#ifdef __NR_osf_kmodcall -#define SYS_osf_kmodcall __NR_osf_kmodcall -#endif - -#ifdef __NR_osf_lstat -#define SYS_osf_lstat __NR_osf_lstat -#endif - -#ifdef __NR_osf_memcntl -#define SYS_osf_memcntl __NR_osf_memcntl -#endif - -#ifdef __NR_osf_mincore -#define SYS_osf_mincore __NR_osf_mincore -#endif - -#ifdef __NR_osf_mount -#define SYS_osf_mount __NR_osf_mount -#endif - -#ifdef __NR_osf_mremap -#define SYS_osf_mremap __NR_osf_mremap -#endif - -#ifdef __NR_osf_msfs_syscall -#define SYS_osf_msfs_syscall __NR_osf_msfs_syscall -#endif - -#ifdef __NR_osf_msleep -#define SYS_osf_msleep __NR_osf_msleep -#endif - -#ifdef __NR_osf_mvalid -#define 
SYS_osf_mvalid __NR_osf_mvalid -#endif - -#ifdef __NR_osf_mwakeup -#define SYS_osf_mwakeup __NR_osf_mwakeup -#endif - -#ifdef __NR_osf_naccept -#define SYS_osf_naccept __NR_osf_naccept -#endif - -#ifdef __NR_osf_nfssvc -#define SYS_osf_nfssvc __NR_osf_nfssvc -#endif - -#ifdef __NR_osf_ngetpeername -#define SYS_osf_ngetpeername __NR_osf_ngetpeername -#endif - -#ifdef __NR_osf_ngetsockname -#define SYS_osf_ngetsockname __NR_osf_ngetsockname -#endif - -#ifdef __NR_osf_nrecvfrom -#define SYS_osf_nrecvfrom __NR_osf_nrecvfrom -#endif - -#ifdef __NR_osf_nrecvmsg -#define SYS_osf_nrecvmsg __NR_osf_nrecvmsg -#endif - -#ifdef __NR_osf_nsendmsg -#define SYS_osf_nsendmsg __NR_osf_nsendmsg -#endif - -#ifdef __NR_osf_ntp_adjtime -#define SYS_osf_ntp_adjtime __NR_osf_ntp_adjtime -#endif - -#ifdef __NR_osf_ntp_gettime -#define SYS_osf_ntp_gettime __NR_osf_ntp_gettime -#endif - -#ifdef __NR_osf_old_creat -#define SYS_osf_old_creat __NR_osf_old_creat -#endif - -#ifdef __NR_osf_old_fstat -#define SYS_osf_old_fstat __NR_osf_old_fstat -#endif - -#ifdef __NR_osf_old_getpgrp -#define SYS_osf_old_getpgrp __NR_osf_old_getpgrp -#endif - -#ifdef __NR_osf_old_killpg -#define SYS_osf_old_killpg __NR_osf_old_killpg -#endif - -#ifdef __NR_osf_old_lstat -#define SYS_osf_old_lstat __NR_osf_old_lstat -#endif - -#ifdef __NR_osf_old_open -#define SYS_osf_old_open __NR_osf_old_open -#endif - -#ifdef __NR_osf_old_sigaction -#define SYS_osf_old_sigaction __NR_osf_old_sigaction -#endif - -#ifdef __NR_osf_old_sigblock -#define SYS_osf_old_sigblock __NR_osf_old_sigblock -#endif - -#ifdef __NR_osf_old_sigreturn -#define SYS_osf_old_sigreturn __NR_osf_old_sigreturn -#endif - -#ifdef __NR_osf_old_sigsetmask -#define SYS_osf_old_sigsetmask __NR_osf_old_sigsetmask -#endif - -#ifdef __NR_osf_old_sigvec -#define SYS_osf_old_sigvec __NR_osf_old_sigvec -#endif - -#ifdef __NR_osf_old_stat -#define SYS_osf_old_stat __NR_osf_old_stat -#endif - -#ifdef __NR_osf_old_vadvise -#define SYS_osf_old_vadvise __NR_osf_old_vadvise -#endif - -#ifdef __NR_osf_old_vtrace -#define SYS_osf_old_vtrace __NR_osf_old_vtrace -#endif - -#ifdef __NR_osf_old_wait -#define SYS_osf_old_wait __NR_osf_old_wait -#endif - -#ifdef __NR_osf_oldquota -#define SYS_osf_oldquota __NR_osf_oldquota -#endif - -#ifdef __NR_osf_pathconf -#define SYS_osf_pathconf __NR_osf_pathconf -#endif - -#ifdef __NR_osf_pid_block -#define SYS_osf_pid_block __NR_osf_pid_block -#endif - -#ifdef __NR_osf_pid_unblock -#define SYS_osf_pid_unblock __NR_osf_pid_unblock -#endif - -#ifdef __NR_osf_plock -#define SYS_osf_plock __NR_osf_plock -#endif - -#ifdef __NR_osf_priocntlset -#define SYS_osf_priocntlset __NR_osf_priocntlset -#endif - -#ifdef __NR_osf_profil -#define SYS_osf_profil __NR_osf_profil -#endif - -#ifdef __NR_osf_proplist_syscall -#define SYS_osf_proplist_syscall __NR_osf_proplist_syscall -#endif - -#ifdef __NR_osf_reboot -#define SYS_osf_reboot __NR_osf_reboot -#endif - -#ifdef __NR_osf_revoke -#define SYS_osf_revoke __NR_osf_revoke -#endif - -#ifdef __NR_osf_sbrk -#define SYS_osf_sbrk __NR_osf_sbrk -#endif - -#ifdef __NR_osf_security -#define SYS_osf_security __NR_osf_security -#endif - -#ifdef __NR_osf_select -#define SYS_osf_select __NR_osf_select -#endif - -#ifdef __NR_osf_set_program_attributes -#define SYS_osf_set_program_attributes __NR_osf_set_program_attributes -#endif - -#ifdef __NR_osf_set_speculative -#define SYS_osf_set_speculative __NR_osf_set_speculative -#endif - -#ifdef __NR_osf_sethostid -#define SYS_osf_sethostid __NR_osf_sethostid -#endif - -#ifdef __NR_osf_setitimer 
-#define SYS_osf_setitimer __NR_osf_setitimer -#endif - -#ifdef __NR_osf_setlogin -#define SYS_osf_setlogin __NR_osf_setlogin -#endif - -#ifdef __NR_osf_setsysinfo -#define SYS_osf_setsysinfo __NR_osf_setsysinfo -#endif - -#ifdef __NR_osf_settimeofday -#define SYS_osf_settimeofday __NR_osf_settimeofday -#endif - -#ifdef __NR_osf_shmat -#define SYS_osf_shmat __NR_osf_shmat -#endif - -#ifdef __NR_osf_signal -#define SYS_osf_signal __NR_osf_signal -#endif - -#ifdef __NR_osf_sigprocmask -#define SYS_osf_sigprocmask __NR_osf_sigprocmask -#endif - -#ifdef __NR_osf_sigsendset -#define SYS_osf_sigsendset __NR_osf_sigsendset -#endif - -#ifdef __NR_osf_sigstack -#define SYS_osf_sigstack __NR_osf_sigstack -#endif - -#ifdef __NR_osf_sigwaitprim -#define SYS_osf_sigwaitprim __NR_osf_sigwaitprim -#endif - -#ifdef __NR_osf_sstk -#define SYS_osf_sstk __NR_osf_sstk -#endif - -#ifdef __NR_osf_stat -#define SYS_osf_stat __NR_osf_stat -#endif - -#ifdef __NR_osf_statfs -#define SYS_osf_statfs __NR_osf_statfs -#endif - -#ifdef __NR_osf_statfs64 -#define SYS_osf_statfs64 __NR_osf_statfs64 -#endif - -#ifdef __NR_osf_subsys_info -#define SYS_osf_subsys_info __NR_osf_subsys_info -#endif - -#ifdef __NR_osf_swapctl -#define SYS_osf_swapctl __NR_osf_swapctl -#endif - -#ifdef __NR_osf_swapon -#define SYS_osf_swapon __NR_osf_swapon -#endif - -#ifdef __NR_osf_syscall -#define SYS_osf_syscall __NR_osf_syscall -#endif - -#ifdef __NR_osf_sysinfo -#define SYS_osf_sysinfo __NR_osf_sysinfo -#endif - -#ifdef __NR_osf_table -#define SYS_osf_table __NR_osf_table -#endif - -#ifdef __NR_osf_uadmin -#define SYS_osf_uadmin __NR_osf_uadmin -#endif - -#ifdef __NR_osf_usleep_thread -#define SYS_osf_usleep_thread __NR_osf_usleep_thread -#endif - -#ifdef __NR_osf_uswitch -#define SYS_osf_uswitch __NR_osf_uswitch -#endif - -#ifdef __NR_osf_utc_adjtime -#define SYS_osf_utc_adjtime __NR_osf_utc_adjtime -#endif - -#ifdef __NR_osf_utc_gettime -#define SYS_osf_utc_gettime __NR_osf_utc_gettime -#endif - -#ifdef __NR_osf_utimes -#define SYS_osf_utimes __NR_osf_utimes -#endif - -#ifdef __NR_osf_utsname -#define SYS_osf_utsname __NR_osf_utsname -#endif - -#ifdef __NR_osf_wait4 -#define SYS_osf_wait4 __NR_osf_wait4 -#endif - -#ifdef __NR_osf_waitid -#define SYS_osf_waitid __NR_osf_waitid -#endif - -#ifdef __NR_pause -#define SYS_pause __NR_pause -#endif - -#ifdef __NR_pciconfig_iobase -#define SYS_pciconfig_iobase __NR_pciconfig_iobase -#endif - -#ifdef __NR_pciconfig_read -#define SYS_pciconfig_read __NR_pciconfig_read -#endif - -#ifdef __NR_pciconfig_write -#define SYS_pciconfig_write __NR_pciconfig_write -#endif - -#ifdef __NR_perf_event_open -#define SYS_perf_event_open __NR_perf_event_open -#endif - -#ifdef __NR_perfctr -#define SYS_perfctr __NR_perfctr -#endif - -#ifdef __NR_perfmonctl -#define SYS_perfmonctl __NR_perfmonctl -#endif - -#ifdef __NR_personality -#define SYS_personality __NR_personality -#endif - -#ifdef __NR_pipe -#define SYS_pipe __NR_pipe -#endif - -#ifdef __NR_pipe2 -#define SYS_pipe2 __NR_pipe2 -#endif - -#ifdef __NR_pivot_root -#define SYS_pivot_root __NR_pivot_root -#endif - -#ifdef __NR_pkey_alloc -#define SYS_pkey_alloc __NR_pkey_alloc -#endif - -#ifdef __NR_pkey_free -#define SYS_pkey_free __NR_pkey_free -#endif - -#ifdef __NR_pkey_mprotect -#define SYS_pkey_mprotect __NR_pkey_mprotect -#endif - -#ifdef __NR_poll -#define SYS_poll __NR_poll -#endif - -#ifdef __NR_ppoll -#define SYS_ppoll __NR_ppoll -#endif - -#ifdef __NR_prctl -#define SYS_prctl __NR_prctl -#endif - -#ifdef __NR_pread64 -#define SYS_pread64 __NR_pread64 
-#endif - -#ifdef __NR_preadv -#define SYS_preadv __NR_preadv -#endif - -#ifdef __NR_preadv2 -#define SYS_preadv2 __NR_preadv2 -#endif - -#ifdef __NR_prlimit64 -#define SYS_prlimit64 __NR_prlimit64 -#endif - -#ifdef __NR_process_vm_readv -#define SYS_process_vm_readv __NR_process_vm_readv -#endif - -#ifdef __NR_process_vm_writev -#define SYS_process_vm_writev __NR_process_vm_writev -#endif - -#ifdef __NR_prof -#define SYS_prof __NR_prof -#endif - -#ifdef __NR_profil -#define SYS_profil __NR_profil -#endif - -#ifdef __NR_pselect6 -#define SYS_pselect6 __NR_pselect6 -#endif - -#ifdef __NR_pselect6_time64 -#define SYS_pselect6_time64 __NR_pselect6_time64 -#endif - -#ifdef __NR_ptrace -#define SYS_ptrace __NR_ptrace -#endif - -#ifdef __NR_putpmsg -#define SYS_putpmsg __NR_putpmsg -#endif - -#ifdef __NR_pwrite64 -#define SYS_pwrite64 __NR_pwrite64 -#endif - -#ifdef __NR_pwritev -#define SYS_pwritev __NR_pwritev -#endif - -#ifdef __NR_pwritev2 -#define SYS_pwritev2 __NR_pwritev2 -#endif - -#ifdef __NR_query_module -#define SYS_query_module __NR_query_module -#endif - -#ifdef __NR_quotactl -#define SYS_quotactl __NR_quotactl -#endif - -#ifdef __NR_read -#define SYS_read __NR_read -#endif - -#ifdef __NR_readahead -#define SYS_readahead __NR_readahead -#endif - -#ifdef __NR_readdir -#define SYS_readdir __NR_readdir -#endif - -#ifdef __NR_readlink -#define SYS_readlink __NR_readlink -#endif - -#ifdef __NR_readlinkat -#define SYS_readlinkat __NR_readlinkat -#endif - -#ifdef __NR_readv -#define SYS_readv __NR_readv -#endif - -#ifdef __NR_reboot -#define SYS_reboot __NR_reboot -#endif - -#ifdef __NR_recv -#define SYS_recv __NR_recv -#endif - -#ifdef __NR_recvfrom -#define SYS_recvfrom __NR_recvfrom -#endif - -#ifdef __NR_recvmmsg -#define SYS_recvmmsg __NR_recvmmsg -#endif - -#ifdef __NR_recvmsg -#define SYS_recvmsg __NR_recvmsg -#endif - -#ifdef __NR_remap_file_pages -#define SYS_remap_file_pages __NR_remap_file_pages -#endif - -#ifdef __NR_removexattr -#define SYS_removexattr __NR_removexattr -#endif - -#ifdef __NR_rename -#define SYS_rename __NR_rename -#endif - -#ifdef __NR_renameat -#define SYS_renameat __NR_renameat -#endif - -#ifdef __NR_renameat2 -#define SYS_renameat2 __NR_renameat2 -#endif - -#ifdef __NR_request_key -#define SYS_request_key __NR_request_key -#endif - -#ifdef __NR_restart_syscall -#define SYS_restart_syscall __NR_restart_syscall -#endif - -#ifdef __NR_rmdir -#define SYS_rmdir __NR_rmdir -#endif - -#ifdef __NR_rseq -#define SYS_rseq __NR_rseq -#endif - -#ifdef __NR_rt_sigaction -#define SYS_rt_sigaction __NR_rt_sigaction -#endif - -#ifdef __NR_rt_sigpending -#define SYS_rt_sigpending __NR_rt_sigpending -#endif - -#ifdef __NR_rt_sigprocmask -#define SYS_rt_sigprocmask __NR_rt_sigprocmask -#endif - -#ifdef __NR_rt_sigqueueinfo -#define SYS_rt_sigqueueinfo __NR_rt_sigqueueinfo -#endif - -#ifdef __NR_rt_sigreturn -#define SYS_rt_sigreturn __NR_rt_sigreturn -#endif - -#ifdef __NR_rt_sigsuspend -#define SYS_rt_sigsuspend __NR_rt_sigsuspend -#endif - -#ifdef __NR_rt_sigtimedwait -#define SYS_rt_sigtimedwait __NR_rt_sigtimedwait -#endif - -#ifdef __NR_rt_tgsigqueueinfo -#define SYS_rt_tgsigqueueinfo __NR_rt_tgsigqueueinfo -#endif - -#ifdef __NR_rtas -#define SYS_rtas __NR_rtas -#endif - -#ifdef __NR_s390_guarded_storage -#define SYS_s390_guarded_storage __NR_s390_guarded_storage -#endif - -#ifdef __NR_s390_pci_mmio_read -#define SYS_s390_pci_mmio_read __NR_s390_pci_mmio_read -#endif - -#ifdef __NR_s390_pci_mmio_write -#define SYS_s390_pci_mmio_write __NR_s390_pci_mmio_write -#endif - 
-#ifdef __NR_s390_runtime_instr -#define SYS_s390_runtime_instr __NR_s390_runtime_instr -#endif - -#ifdef __NR_s390_sthyi -#define SYS_s390_sthyi __NR_s390_sthyi -#endif - -#ifdef __NR_sched_get_affinity -#define SYS_sched_get_affinity __NR_sched_get_affinity -#endif - -#ifdef __NR_sched_get_priority_max -#define SYS_sched_get_priority_max __NR_sched_get_priority_max -#endif - -#ifdef __NR_sched_get_priority_min -#define SYS_sched_get_priority_min __NR_sched_get_priority_min -#endif - -#ifdef __NR_sched_getaffinity -#define SYS_sched_getaffinity __NR_sched_getaffinity -#endif - -#ifdef __NR_sched_getattr -#define SYS_sched_getattr __NR_sched_getattr -#endif - -#ifdef __NR_sched_getparam -#define SYS_sched_getparam __NR_sched_getparam -#endif - -#ifdef __NR_sched_getscheduler -#define SYS_sched_getscheduler __NR_sched_getscheduler -#endif - -#ifdef __NR_sched_rr_get_interval -#define SYS_sched_rr_get_interval __NR_sched_rr_get_interval -#endif - -#ifdef __NR_sched_rr_get_interval_time64 -#define SYS_sched_rr_get_interval_time64 __NR_sched_rr_get_interval_time64 -#endif - -#ifdef __NR_sched_set_affinity -#define SYS_sched_set_affinity __NR_sched_set_affinity -#endif - -#ifdef __NR_sched_setaffinity -#define SYS_sched_setaffinity __NR_sched_setaffinity -#endif - -#ifdef __NR_sched_setattr -#define SYS_sched_setattr __NR_sched_setattr -#endif - -#ifdef __NR_sched_setparam -#define SYS_sched_setparam __NR_sched_setparam -#endif - -#ifdef __NR_sched_setscheduler -#define SYS_sched_setscheduler __NR_sched_setscheduler -#endif - -#ifdef __NR_sched_yield -#define SYS_sched_yield __NR_sched_yield -#endif - -#ifdef __NR_seccomp -#define SYS_seccomp __NR_seccomp -#endif - -#ifdef __NR_security -#define SYS_security __NR_security -#endif - -#ifdef __NR_select -#define SYS_select __NR_select -#endif - -#ifdef __NR_semctl -#define SYS_semctl __NR_semctl -#endif - -#ifdef __NR_semget -#define SYS_semget __NR_semget -#endif - -#ifdef __NR_semop -#define SYS_semop __NR_semop -#endif - -#ifdef __NR_semtimedop -#define SYS_semtimedop __NR_semtimedop -#endif - -#ifdef __NR_send -#define SYS_send __NR_send -#endif - -#ifdef __NR_sendfile -#define SYS_sendfile __NR_sendfile -#endif - -#ifdef __NR_sendfile64 -#define SYS_sendfile64 __NR_sendfile64 -#endif - -#ifdef __NR_sendmmsg -#define SYS_sendmmsg __NR_sendmmsg -#endif - -#ifdef __NR_sendmsg -#define SYS_sendmsg __NR_sendmsg -#endif - -#ifdef __NR_sendto -#define SYS_sendto __NR_sendto -#endif - -#ifdef __NR_set_mempolicy -#define SYS_set_mempolicy __NR_set_mempolicy -#endif - -#ifdef __NR_set_robust_list -#define SYS_set_robust_list __NR_set_robust_list -#endif - -#ifdef __NR_set_thread_area -#define SYS_set_thread_area __NR_set_thread_area -#endif - -#ifdef __NR_set_tid_address -#define SYS_set_tid_address __NR_set_tid_address -#endif - -#ifdef __NR_setdomainname -#define SYS_setdomainname __NR_setdomainname -#endif - -#ifdef __NR_setfsgid -#define SYS_setfsgid __NR_setfsgid -#endif - -#ifdef __NR_setfsgid32 -#define SYS_setfsgid32 __NR_setfsgid32 -#endif - -#ifdef __NR_setfsuid -#define SYS_setfsuid __NR_setfsuid -#endif - -#ifdef __NR_setfsuid32 -#define SYS_setfsuid32 __NR_setfsuid32 -#endif - -#ifdef __NR_setgid -#define SYS_setgid __NR_setgid -#endif - -#ifdef __NR_setgid32 -#define SYS_setgid32 __NR_setgid32 -#endif - -#ifdef __NR_setgroups -#define SYS_setgroups __NR_setgroups -#endif - -#ifdef __NR_setgroups32 -#define SYS_setgroups32 __NR_setgroups32 -#endif - -#ifdef __NR_sethae -#define SYS_sethae __NR_sethae -#endif - -#ifdef __NR_sethostname 
-#define SYS_sethostname __NR_sethostname -#endif - -#ifdef __NR_setitimer -#define SYS_setitimer __NR_setitimer -#endif - -#ifdef __NR_setns -#define SYS_setns __NR_setns -#endif - -#ifdef __NR_setpgid -#define SYS_setpgid __NR_setpgid -#endif - -#ifdef __NR_setpgrp -#define SYS_setpgrp __NR_setpgrp -#endif - -#ifdef __NR_setpriority -#define SYS_setpriority __NR_setpriority -#endif - -#ifdef __NR_setregid -#define SYS_setregid __NR_setregid -#endif - -#ifdef __NR_setregid32 -#define SYS_setregid32 __NR_setregid32 -#endif - -#ifdef __NR_setresgid -#define SYS_setresgid __NR_setresgid -#endif - -#ifdef __NR_setresgid32 -#define SYS_setresgid32 __NR_setresgid32 -#endif - -#ifdef __NR_setresuid -#define SYS_setresuid __NR_setresuid -#endif - -#ifdef __NR_setresuid32 -#define SYS_setresuid32 __NR_setresuid32 -#endif - -#ifdef __NR_setreuid -#define SYS_setreuid __NR_setreuid -#endif - -#ifdef __NR_setreuid32 -#define SYS_setreuid32 __NR_setreuid32 -#endif - -#ifdef __NR_setrlimit -#define SYS_setrlimit __NR_setrlimit -#endif - -#ifdef __NR_setsid -#define SYS_setsid __NR_setsid -#endif - -#ifdef __NR_setsockopt -#define SYS_setsockopt __NR_setsockopt -#endif - -#ifdef __NR_settimeofday -#define SYS_settimeofday __NR_settimeofday -#endif - -#ifdef __NR_setuid -#define SYS_setuid __NR_setuid -#endif - -#ifdef __NR_setuid32 -#define SYS_setuid32 __NR_setuid32 -#endif - -#ifdef __NR_setxattr -#define SYS_setxattr __NR_setxattr -#endif - -#ifdef __NR_sgetmask -#define SYS_sgetmask __NR_sgetmask -#endif - -#ifdef __NR_shmat -#define SYS_shmat __NR_shmat -#endif - -#ifdef __NR_shmctl -#define SYS_shmctl __NR_shmctl -#endif - -#ifdef __NR_shmdt -#define SYS_shmdt __NR_shmdt -#endif - -#ifdef __NR_shmget -#define SYS_shmget __NR_shmget -#endif - -#ifdef __NR_shutdown -#define SYS_shutdown __NR_shutdown -#endif - -#ifdef __NR_sigaction -#define SYS_sigaction __NR_sigaction -#endif - -#ifdef __NR_sigaltstack -#define SYS_sigaltstack __NR_sigaltstack -#endif - -#ifdef __NR_signal -#define SYS_signal __NR_signal -#endif - -#ifdef __NR_signalfd -#define SYS_signalfd __NR_signalfd -#endif - -#ifdef __NR_signalfd4 -#define SYS_signalfd4 __NR_signalfd4 -#endif - -#ifdef __NR_sigpending -#define SYS_sigpending __NR_sigpending -#endif - -#ifdef __NR_sigprocmask -#define SYS_sigprocmask __NR_sigprocmask -#endif - -#ifdef __NR_sigreturn -#define SYS_sigreturn __NR_sigreturn -#endif - -#ifdef __NR_sigsuspend -#define SYS_sigsuspend __NR_sigsuspend -#endif - -#ifdef __NR_socket -#define SYS_socket __NR_socket -#endif - -#ifdef __NR_socketcall -#define SYS_socketcall __NR_socketcall -#endif - -#ifdef __NR_socketpair -#define SYS_socketpair __NR_socketpair -#endif - -#ifdef __NR_splice -#define SYS_splice __NR_splice -#endif - -#ifdef __NR_spu_create -#define SYS_spu_create __NR_spu_create -#endif - -#ifdef __NR_spu_run -#define SYS_spu_run __NR_spu_run -#endif - -#ifdef __NR_ssetmask -#define SYS_ssetmask __NR_ssetmask -#endif - -#ifdef __NR_stat -#define SYS_stat __NR_stat -#endif - -#ifdef __NR_stat64 -#define SYS_stat64 __NR_stat64 -#endif - -#ifdef __NR_statfs -#define SYS_statfs __NR_statfs -#endif - -#ifdef __NR_statfs64 -#define SYS_statfs64 __NR_statfs64 -#endif - -#ifdef __NR_statx -#define SYS_statx __NR_statx -#endif - -#ifdef __NR_stime -#define SYS_stime __NR_stime -#endif - -#ifdef __NR_stty -#define SYS_stty __NR_stty -#endif - -#ifdef __NR_subpage_prot -#define SYS_subpage_prot __NR_subpage_prot -#endif - -#ifdef __NR_swapcontext -#define SYS_swapcontext __NR_swapcontext -#endif - -#ifdef 
__NR_swapoff -#define SYS_swapoff __NR_swapoff -#endif - -#ifdef __NR_swapon -#define SYS_swapon __NR_swapon -#endif - -#ifdef __NR_switch_endian -#define SYS_switch_endian __NR_switch_endian -#endif - -#ifdef __NR_symlink -#define SYS_symlink __NR_symlink -#endif - -#ifdef __NR_symlinkat -#define SYS_symlinkat __NR_symlinkat -#endif - -#ifdef __NR_sync -#define SYS_sync __NR_sync -#endif - -#ifdef __NR_sync_file_range -#define SYS_sync_file_range __NR_sync_file_range -#endif - -#ifdef __NR_sync_file_range2 -#define SYS_sync_file_range2 __NR_sync_file_range2 -#endif - -#ifdef __NR_syncfs -#define SYS_syncfs __NR_syncfs -#endif - -#ifdef __NR_sys_debug_setcontext -#define SYS_sys_debug_setcontext __NR_sys_debug_setcontext -#endif - -#ifdef __NR_sys_epoll_create -#define SYS_sys_epoll_create __NR_sys_epoll_create -#endif - -#ifdef __NR_sys_epoll_ctl -#define SYS_sys_epoll_ctl __NR_sys_epoll_ctl -#endif - -#ifdef __NR_sys_epoll_wait -#define SYS_sys_epoll_wait __NR_sys_epoll_wait -#endif - -#ifdef __NR_syscall -#define SYS_syscall __NR_syscall -#endif - -#ifdef __NR_sysfs -#define SYS_sysfs __NR_sysfs -#endif - -#ifdef __NR_sysinfo -#define SYS_sysinfo __NR_sysinfo -#endif - -#ifdef __NR_syslog -#define SYS_syslog __NR_syslog -#endif - -#ifdef __NR_sysmips -#define SYS_sysmips __NR_sysmips -#endif - -#ifdef __NR_tee -#define SYS_tee __NR_tee -#endif - -#ifdef __NR_tgkill -#define SYS_tgkill __NR_tgkill -#endif - -#ifdef __NR_time -#define SYS_time __NR_time -#endif - -#ifdef __NR_timer_create -#define SYS_timer_create __NR_timer_create -#endif - -#ifdef __NR_timer_delete -#define SYS_timer_delete __NR_timer_delete -#endif - -#ifdef __NR_timer_getoverrun -#define SYS_timer_getoverrun __NR_timer_getoverrun -#endif - -#ifdef __NR_timer_gettime -#define SYS_timer_gettime __NR_timer_gettime -#endif - -#ifdef __NR_timer_settime -#define SYS_timer_settime __NR_timer_settime -#endif - -#ifdef __NR_timerfd -#define SYS_timerfd __NR_timerfd -#endif - -#ifdef __NR_timerfd_create -#define SYS_timerfd_create __NR_timerfd_create -#endif - -#ifdef __NR_timerfd_gettime -#define SYS_timerfd_gettime __NR_timerfd_gettime -#endif - -#ifdef __NR_timerfd_settime -#define SYS_timerfd_settime __NR_timerfd_settime -#endif - -#ifdef __NR_times -#define SYS_times __NR_times -#endif - -#ifdef __NR_tkill -#define SYS_tkill __NR_tkill -#endif - -#ifdef __NR_truncate -#define SYS_truncate __NR_truncate -#endif - -#ifdef __NR_truncate64 -#define SYS_truncate64 __NR_truncate64 -#endif - -#ifdef __NR_tuxcall -#define SYS_tuxcall __NR_tuxcall -#endif - -#ifdef __NR_ugetrlimit -#define SYS_ugetrlimit __NR_ugetrlimit -#endif - -#ifdef __NR_ulimit -#define SYS_ulimit __NR_ulimit -#endif - -#ifdef __NR_umask -#define SYS_umask __NR_umask -#endif - -#ifdef __NR_umount -#define SYS_umount __NR_umount -#endif - -#ifdef __NR_umount2 -#define SYS_umount2 __NR_umount2 -#endif - -#ifdef __NR_uname -#define SYS_uname __NR_uname -#endif - -#ifdef __NR_unlink -#define SYS_unlink __NR_unlink -#endif - -#ifdef __NR_unlinkat -#define SYS_unlinkat __NR_unlinkat -#endif - -#ifdef __NR_unshare -#define SYS_unshare __NR_unshare -#endif - -#ifdef __NR_uselib -#define SYS_uselib __NR_uselib -#endif - -#ifdef __NR_userfaultfd -#define SYS_userfaultfd __NR_userfaultfd -#endif - -#ifdef __NR_ustat -#define SYS_ustat __NR_ustat -#endif - -#ifdef __NR_utime -#define SYS_utime __NR_utime -#endif - -#ifdef __NR_utimensat -#define SYS_utimensat __NR_utimensat -#endif - -#ifdef __NR_utimes -#define SYS_utimes __NR_utimes -#endif - -#ifdef __NR_utrap_install 
-#define SYS_utrap_install __NR_utrap_install -#endif - -#ifdef __NR_vfork -#define SYS_vfork __NR_vfork -#endif - -#ifdef __NR_vhangup -#define SYS_vhangup __NR_vhangup -#endif - -#ifdef __NR_vm86 -#define SYS_vm86 __NR_vm86 -#endif - -#ifdef __NR_vm86old -#define SYS_vm86old __NR_vm86old -#endif - -#ifdef __NR_vmsplice -#define SYS_vmsplice __NR_vmsplice -#endif - -#ifdef __NR_vserver -#define SYS_vserver __NR_vserver -#endif - -#ifdef __NR_wait4 -#define SYS_wait4 __NR_wait4 -#endif - -#ifdef __NR_waitid -#define SYS_waitid __NR_waitid -#endif - -#ifdef __NR_waitpid -#define SYS_waitpid __NR_waitpid -#endif - -#ifdef __NR_write -#define SYS_write __NR_write -#endif - -#ifdef __NR_writev -#define SYS_writev __NR_writev -#endif diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index ac170bf7b3da33..9f67dea006699f 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -94,81 +94,81 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.inttypes.strtoumax # stdbit.h entrypoints - libc.src.stdbit.stdc_leading_zeros_uc - libc.src.stdbit.stdc_leading_zeros_us - libc.src.stdbit.stdc_leading_zeros_ui - libc.src.stdbit.stdc_leading_zeros_ul - libc.src.stdbit.stdc_leading_zeros_ull - libc.src.stdbit.stdc_leading_ones_uc - libc.src.stdbit.stdc_leading_ones_us - libc.src.stdbit.stdc_leading_ones_ui - libc.src.stdbit.stdc_leading_ones_ul - libc.src.stdbit.stdc_leading_ones_ull - libc.src.stdbit.stdc_trailing_zeros_uc - libc.src.stdbit.stdc_trailing_zeros_us - libc.src.stdbit.stdc_trailing_zeros_ui - libc.src.stdbit.stdc_trailing_zeros_ul - libc.src.stdbit.stdc_trailing_zeros_ull - libc.src.stdbit.stdc_trailing_ones_uc - libc.src.stdbit.stdc_trailing_ones_us - libc.src.stdbit.stdc_trailing_ones_ui - libc.src.stdbit.stdc_trailing_ones_ul - libc.src.stdbit.stdc_trailing_ones_ull - libc.src.stdbit.stdc_first_leading_zero_uc - libc.src.stdbit.stdc_first_leading_zero_us - libc.src.stdbit.stdc_first_leading_zero_ui - libc.src.stdbit.stdc_first_leading_zero_ul - libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_bit_ceil_uc + libc.src.stdbit.stdc_bit_ceil_ui + libc.src.stdbit.stdc_bit_ceil_ul + libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_bit_ceil_us + libc.src.stdbit.stdc_bit_floor_uc + libc.src.stdbit.stdc_bit_floor_ui + libc.src.stdbit.stdc_bit_floor_ul + libc.src.stdbit.stdc_bit_floor_ull + libc.src.stdbit.stdc_bit_floor_us + libc.src.stdbit.stdc_bit_width_uc + libc.src.stdbit.stdc_bit_width_ui + libc.src.stdbit.stdc_bit_width_ul + libc.src.stdbit.stdc_bit_width_ull + libc.src.stdbit.stdc_bit_width_us + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_zeros_us libc.src.stdbit.stdc_first_leading_one_uc - libc.src.stdbit.stdc_first_leading_one_us libc.src.stdbit.stdc_first_leading_one_ui libc.src.stdbit.stdc_first_leading_one_ul libc.src.stdbit.stdc_first_leading_one_ull - libc.src.stdbit.stdc_first_trailing_zero_uc - libc.src.stdbit.stdc_first_trailing_zero_us - libc.src.stdbit.stdc_first_trailing_zero_ui - libc.src.stdbit.stdc_first_trailing_zero_ul - libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_zero_uc + 
libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_zero_us libc.src.stdbit.stdc_first_trailing_one_uc - libc.src.stdbit.stdc_first_trailing_one_us libc.src.stdbit.stdc_first_trailing_one_ui libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull - libc.src.stdbit.stdc_count_zeros_uc - libc.src.stdbit.stdc_count_zeros_us - libc.src.stdbit.stdc_count_zeros_ui - libc.src.stdbit.stdc_count_zeros_ul - libc.src.stdbit.stdc_count_zeros_ull - libc.src.stdbit.stdc_count_ones_uc - libc.src.stdbit.stdc_count_ones_us - libc.src.stdbit.stdc_count_ones_ui - libc.src.stdbit.stdc_count_ones_ul - libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_zero_us libc.src.stdbit.stdc_has_single_bit_uc - libc.src.stdbit.stdc_has_single_bit_us libc.src.stdbit.stdc_has_single_bit_ui libc.src.stdbit.stdc_has_single_bit_ul libc.src.stdbit.stdc_has_single_bit_ull - libc.src.stdbit.stdc_bit_width_uc - libc.src.stdbit.stdc_bit_width_us - libc.src.stdbit.stdc_bit_width_ui - libc.src.stdbit.stdc_bit_width_ul - libc.src.stdbit.stdc_bit_width_ull - libc.src.stdbit.stdc_bit_floor_uc - libc.src.stdbit.stdc_bit_floor_us - libc.src.stdbit.stdc_bit_floor_ui - libc.src.stdbit.stdc_bit_floor_ul - libc.src.stdbit.stdc_bit_floor_ull - libc.src.stdbit.stdc_bit_ceil_uc - libc.src.stdbit.stdc_bit_ceil_us - libc.src.stdbit.stdc_bit_ceil_ui - libc.src.stdbit.stdc_bit_ceil_ul - libc.src.stdbit.stdc_bit_ceil_ull + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_zeros_us # stdlib.h entrypoints libc.src.stdlib.abs - libc.src.stdlib.atoi libc.src.stdlib.atof + libc.src.stdlib.atoi libc.src.stdlib.atol libc.src.stdlib.atoll libc.src.stdlib.bsearch @@ -193,52 +193,52 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.strtoull # stdlib.h external entrypoints - libc.src.stdlib.malloc - libc.src.stdlib.calloc - libc.src.stdlib.realloc libc.src.stdlib.aligned_alloc + libc.src.stdlib.calloc libc.src.stdlib.free + libc.src.stdlib.malloc + libc.src.stdlib.realloc # stdio.h entrypoints + libc.src.stdio.fdopen + libc.src.stdio.fileno + libc.src.stdio.fprintf + libc.src.stdio.fscanf + libc.src.stdio.printf libc.src.stdio.remove libc.src.stdio.rename - libc.src.stdio.sprintf + libc.src.stdio.scanf libc.src.stdio.snprintf - libc.src.stdio.fprintf - libc.src.stdio.printf - libc.src.stdio.vsprintf - libc.src.stdio.vsnprintf + libc.src.stdio.sprintf + libc.src.stdio.sscanf libc.src.stdio.vfprintf 
libc.src.stdio.vprintf - libc.src.stdio.sscanf - libc.src.stdio.scanf - libc.src.stdio.fscanf - libc.src.stdio.fileno - libc.src.stdio.fdopen + libc.src.stdio.vsnprintf + libc.src.stdio.vsprintf # sys/epoll.h entrypoints libc.src.sys.epoll.epoll_create libc.src.sys.epoll.epoll_create1 libc.src.sys.epoll.epoll_ctl - libc.src.sys.epoll.epoll_wait libc.src.sys.epoll.epoll_pwait + libc.src.sys.epoll.epoll_wait # TODO: Need to check if pwait2 is available before providing. # https://github.com/llvm/llvm-project/issues/80060 # libc.src.sys.epoll.epoll_pwait2 # sys/mman.h entrypoints libc.src.sys.mman.madvise - libc.src.sys.mman.mmap - libc.src.sys.mman.mprotect - libc.src.sys.mman.munmap - libc.src.sys.mman.posix_madvise libc.src.sys.mman.mincore libc.src.sys.mman.mlock libc.src.sys.mman.mlock2 - libc.src.sys.mman.munlock libc.src.sys.mman.mlockall - libc.src.sys.mman.munlockall + libc.src.sys.mman.mmap + libc.src.sys.mman.mprotect libc.src.sys.mman.msync + libc.src.sys.mman.munlock + libc.src.sys.mman.munlockall + libc.src.sys.mman.munmap + libc.src.sys.mman.posix_madvise libc.src.sys.mman.shm_open libc.src.sys.mman.shm_unlink @@ -263,8 +263,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.stat.stat # sys/statvfs.h - libc.src.sys.statvfs.statvfs libc.src.sys.statvfs.fstatvfs + libc.src.sys.statvfs.statvfs # sys/utsname.h entrypoints libc.src.sys.utsname.uname @@ -285,11 +285,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.termios.cfgetospeed libc.src.termios.cfsetispeed libc.src.termios.cfsetospeed - libc.src.termios.tcgetattr - libc.src.termios.tcgetsid libc.src.termios.tcdrain libc.src.termios.tcflow libc.src.termios.tcflush + libc.src.termios.tcgetattr + libc.src.termios.tcgetsid libc.src.termios.tcsendbreak libc.src.termios.tcsetattr @@ -342,11 +342,11 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fegetexceptflag libc.src.fenv.fegetround libc.src.fenv.feholdexcept + libc.src.fenv.feraiseexcept libc.src.fenv.fesetenv libc.src.fenv.fesetexcept libc.src.fenv.fesetexceptflag libc.src.fenv.fesetround - libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv @@ -362,23 +362,23 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.canonicalize libc.src.math.canonicalizef libc.src.math.canonicalizel - libc.src.math.copysign - libc.src.math.copysignf - libc.src.math.copysignl libc.src.math.ceil libc.src.math.ceilf libc.src.math.ceill + libc.src.math.copysign + libc.src.math.copysignf + libc.src.math.copysignl libc.src.math.cos - libc.src.math.coshf libc.src.math.cosf + libc.src.math.coshf libc.src.math.erff libc.src.math.exp - libc.src.math.expf libc.src.math.exp10 libc.src.math.exp10f libc.src.math.exp2 libc.src.math.exp2f libc.src.math.exp2m1f + libc.src.math.expf libc.src.math.expm1 libc.src.math.expm1f libc.src.math.fabs @@ -392,40 +392,40 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.floorl libc.src.math.fma libc.src.math.fmaf - libc.src.math.fmin - libc.src.math.fminf - libc.src.math.fminl libc.src.math.fmax libc.src.math.fmaxf - libc.src.math.fmaxl libc.src.math.fmaximum - libc.src.math.fmaximumf - libc.src.math.fmaximuml - libc.src.math.fmaximum_num - libc.src.math.fmaximum_numf - libc.src.math.fmaximum_numl libc.src.math.fmaximum_mag - libc.src.math.fmaximum_magf - libc.src.math.fmaximum_magl libc.src.math.fmaximum_mag_num libc.src.math.fmaximum_mag_numf libc.src.math.fmaximum_mag_numl + libc.src.math.fmaximum_magf + libc.src.math.fmaximum_magl + libc.src.math.fmaximum_num + libc.src.math.fmaximum_numf + libc.src.math.fmaximum_numl + 
libc.src.math.fmaximumf + libc.src.math.fmaximuml + libc.src.math.fmaxl + libc.src.math.fmin + libc.src.math.fminf libc.src.math.fminimum - libc.src.math.fminimumf - libc.src.math.fminimuml - libc.src.math.fminimum_num - libc.src.math.fminimum_numf - libc.src.math.fminimum_numl libc.src.math.fminimum_mag - libc.src.math.fminimum_magf - libc.src.math.fminimum_magl libc.src.math.fminimum_mag_num libc.src.math.fminimum_mag_numf libc.src.math.fminimum_mag_numl - libc.src.math.fmul + libc.src.math.fminimum_magf + libc.src.math.fminimum_magl + libc.src.math.fminimum_num + libc.src.math.fminimum_numf + libc.src.math.fminimum_numl + libc.src.math.fminimumf + libc.src.math.fminimuml + libc.src.math.fminl libc.src.math.fmod libc.src.math.fmodf libc.src.math.fmodl + libc.src.math.fmul libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl @@ -452,17 +452,17 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.llround libc.src.math.llroundf libc.src.math.llroundl + libc.src.math.log libc.src.math.log10 libc.src.math.log10f libc.src.math.log1p libc.src.math.log1pf libc.src.math.log2 libc.src.math.log2f - libc.src.math.log - libc.src.math.logf libc.src.math.logb libc.src.math.logbf libc.src.math.logbl + libc.src.math.logf libc.src.math.lrint libc.src.math.lrintf libc.src.math.lrintl @@ -491,28 +491,29 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.nextupf libc.src.math.nextupl libc.src.math.powf - libc.src.math.remainderf libc.src.math.remainder + libc.src.math.remainderf libc.src.math.remainderl - libc.src.math.remquof libc.src.math.remquo + libc.src.math.remquof libc.src.math.remquol libc.src.math.rint libc.src.math.rintf libc.src.math.rintl libc.src.math.round - libc.src.math.roundf - libc.src.math.roundl libc.src.math.roundeven libc.src.math.roundevenf libc.src.math.roundevenl + libc.src.math.roundf + libc.src.math.roundl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl libc.src.math.sin + libc.src.math.sincos libc.src.math.sincosf - libc.src.math.sinhf libc.src.math.sinf + libc.src.math.sinhf libc.src.math.sqrt libc.src.math.sqrtf libc.src.math.sqrtl @@ -536,21 +537,28 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 + libc.src.math.f16div + libc.src.math.f16divf + libc.src.math.f16divl + libc.src.math.f16fma libc.src.math.f16fmaf + libc.src.math.f16fmal + libc.src.math.f16sqrt libc.src.math.f16sqrtf + libc.src.math.f16sqrtl libc.src.math.fabsf16 libc.src.math.fdimf16 libc.src.math.floorf16 libc.src.math.fmaxf16 - libc.src.math.fmaximumf16 - libc.src.math.fmaximum_magf16 libc.src.math.fmaximum_mag_numf16 + libc.src.math.fmaximum_magf16 libc.src.math.fmaximum_numf16 + libc.src.math.fmaximumf16 libc.src.math.fminf16 - libc.src.math.fminimumf16 - libc.src.math.fminimum_magf16 libc.src.math.fminimum_mag_numf16 + libc.src.math.fminimum_magf16 libc.src.math.fminimum_numf16 + libc.src.math.fminimumf16 libc.src.math.fmodf16 libc.src.math.frexpf16 libc.src.math.fromfpf16 @@ -574,8 +582,8 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.remainderf16 libc.src.math.remquof16 libc.src.math.rintf16 - libc.src.math.roundf16 libc.src.math.roundevenf16 + libc.src.math.roundf16 libc.src.math.scalblnf16 libc.src.math.scalbnf16 libc.src.math.setpayloadf16 @@ -586,6 +594,15 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.ufromfpf16 libc.src.math.ufromfpxf16 ) + + if(LIBC_TYPES_HAS_FLOAT128) + list(APPEND TARGET_LIBM_ENTRYPOINTS + # math.h C23 mixed _Float16 and _Float128 entrypoints + libc.src.math.f16divf128 + libc.src.math.f16fmaf128 + 
libc.src.math.f16sqrtf128 + ) + endif() endif() if(LIBC_TYPES_HAS_FLOAT128) @@ -598,15 +615,15 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 - libc.src.math.fminf128 - libc.src.math.fmaximumf128 - libc.src.math.fmaximum_numf128 - libc.src.math.fmaximum_magf128 libc.src.math.fmaximum_mag_numf128 - libc.src.math.fminimumf128 - libc.src.math.fminimum_numf128 - libc.src.math.fminimum_magf128 + libc.src.math.fmaximum_magf128 + libc.src.math.fmaximum_numf128 + libc.src.math.fmaximumf128 + libc.src.math.fminf128 libc.src.math.fminimum_mag_numf128 + libc.src.math.fminimum_magf128 + libc.src.math.fminimum_numf128 + libc.src.math.fminimumf128 libc.src.math.fmodf128 libc.src.math.frexpf128 libc.src.math.fromfpf128 @@ -614,9 +631,9 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.ilogbf128 libc.src.math.ldexpf128 libc.src.math.llogbf128 - libc.src.math.logbf128 libc.src.math.llrintf128 libc.src.math.llroundf128 + libc.src.math.logbf128 libc.src.math.lrintf128 libc.src.math.lroundf128 libc.src.math.modff128 @@ -643,23 +660,23 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.abshk libc.src.stdfix.abshr libc.src.stdfix.absk - libc.src.stdfix.absr libc.src.stdfix.abslk libc.src.stdfix.abslr + libc.src.stdfix.absr libc.src.stdfix.exphk libc.src.stdfix.expk libc.src.stdfix.roundhk libc.src.stdfix.roundhr libc.src.stdfix.roundk - libc.src.stdfix.roundr libc.src.stdfix.roundlk libc.src.stdfix.roundlr + libc.src.stdfix.roundr libc.src.stdfix.rounduhk libc.src.stdfix.rounduhr libc.src.stdfix.rounduk - libc.src.stdfix.roundur libc.src.stdfix.roundulk libc.src.stdfix.roundulr + libc.src.stdfix.roundur libc.src.stdfix.sqrtuhk libc.src.stdfix.sqrtuhr libc.src.stdfix.sqrtuk @@ -694,11 +711,11 @@ if(LLVM_LIBC_FULL_BUILD) # pthread.h entrypoints libc.src.pthread.pthread_atfork libc.src.pthread.pthread_attr_destroy - libc.src.pthread.pthread_attr_init libc.src.pthread.pthread_attr_getdetachstate libc.src.pthread.pthread_attr_getguardsize libc.src.pthread.pthread_attr_getstack libc.src.pthread.pthread_attr_getstacksize + libc.src.pthread.pthread_attr_init libc.src.pthread.pthread_attr_setdetachstate libc.src.pthread.pthread_attr_setguardsize libc.src.pthread.pthread_attr_setstack @@ -718,36 +735,36 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_join libc.src.pthread.pthread_key_create libc.src.pthread.pthread_key_delete - libc.src.pthread.pthread_self - libc.src.pthread.pthread_setname_np libc.src.pthread.pthread_mutex_destroy libc.src.pthread.pthread_mutex_init libc.src.pthread.pthread_mutex_lock libc.src.pthread.pthread_mutex_unlock libc.src.pthread.pthread_mutexattr_destroy - libc.src.pthread.pthread_mutexattr_init libc.src.pthread.pthread_mutexattr_getpshared libc.src.pthread.pthread_mutexattr_getrobust libc.src.pthread.pthread_mutexattr_gettype + libc.src.pthread.pthread_mutexattr_init libc.src.pthread.pthread_mutexattr_setpshared libc.src.pthread.pthread_mutexattr_setrobust libc.src.pthread.pthread_mutexattr_settype libc.src.pthread.pthread_once + libc.src.pthread.pthread_rwlock_destroy libc.src.pthread.pthread_rwlock_init - libc.src.pthread.pthread_rwlock_tryrdlock libc.src.pthread.pthread_rwlock_rdlock libc.src.pthread.pthread_rwlock_timedrdlock - libc.src.pthread.pthread_rwlock_trywrlock - libc.src.pthread.pthread_rwlock_wrlock libc.src.pthread.pthread_rwlock_timedwrlock + libc.src.pthread.pthread_rwlock_tryrdlock + libc.src.pthread.pthread_rwlock_trywrlock libc.src.pthread.pthread_rwlock_unlock - libc.src.pthread.pthread_rwlock_destroy + 
libc.src.pthread.pthread_rwlock_wrlock libc.src.pthread.pthread_rwlockattr_destroy libc.src.pthread.pthread_rwlockattr_getkind_np libc.src.pthread.pthread_rwlockattr_getpshared libc.src.pthread.pthread_rwlockattr_init libc.src.pthread.pthread_rwlockattr_setkind_np libc.src.pthread.pthread_rwlockattr_setpshared + libc.src.pthread.pthread_self + libc.src.pthread.pthread_setname_np libc.src.pthread.pthread_setspecific # sched.h entrypoints @@ -761,24 +778,24 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdio.clearerr libc.src.stdio.clearerr_unlocked libc.src.stdio.fclose - libc.src.stdio.flockfile libc.src.stdio.feof libc.src.stdio.feof_unlocked libc.src.stdio.ferror libc.src.stdio.ferror_unlocked + libc.src.stdio.fflush libc.src.stdio.fgetc libc.src.stdio.fgetc_unlocked libc.src.stdio.fgets - libc.src.stdio.fflush + libc.src.stdio.flockfile libc.src.stdio.fopen + libc.src.stdio.fopencookie libc.src.stdio.fputc libc.src.stdio.fputs - libc.src.stdio.fopencookie libc.src.stdio.fread libc.src.stdio.fread_unlocked libc.src.stdio.fseek - libc.src.stdio.ftell libc.src.stdio.fseeko + libc.src.stdio.ftell libc.src.stdio.ftello libc.src.stdio.funlockfile libc.src.stdio.fwrite @@ -807,16 +824,16 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.quick_exit # signal.h entrypoints - libc.src.signal.raise libc.src.signal.kill + libc.src.signal.raise libc.src.signal.sigaction + libc.src.signal.sigaddset libc.src.signal.sigaltstack libc.src.signal.sigdelset - libc.src.signal.sigaddset libc.src.signal.sigemptyset - libc.src.signal.sigprocmask libc.src.signal.sigfillset libc.src.signal.signal + libc.src.signal.sigprocmask # spawn.h entrypoints libc.src.spawn.posix_spawn @@ -829,10 +846,10 @@ if(LLVM_LIBC_FULL_BUILD) # search.h entrypoints libc.src.search.hcreate libc.src.search.hcreate_r - libc.src.search.hsearch - libc.src.search.hsearch_r libc.src.search.hdestroy libc.src.search.hdestroy_r + libc.src.search.hsearch + libc.src.search.hsearch_r libc.src.search.insque libc.src.search.remque @@ -861,8 +878,8 @@ if(LLVM_LIBC_FULL_BUILD) # time.h entrypoints libc.src.time.asctime libc.src.time.asctime_r - libc.src.time.clock_gettime libc.src.time.clock + libc.src.time.clock_gettime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime @@ -872,24 +889,24 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.time # unistd.h entrypoints + libc.src.unistd.__llvm_libc_syscall libc.src.unistd._exit libc.src.unistd.environ libc.src.unistd.execv libc.src.unistd.fork - libc.src.unistd.__llvm_libc_syscall libc.src.unistd.getopt libc.src.unistd.optarg + libc.src.unistd.opterr libc.src.unistd.optind libc.src.unistd.optopt - libc.src.unistd.opterr libc.src.unistd.swab # sys/select.h entrypoints libc.src.sys.select.select # sys/socket.h entrypoints - libc.src.sys.socket.socket libc.src.sys.socket.bind + libc.src.sys.socket.socket ) endif() diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index a4898724daf86d..b33d572cf999ac 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -251,6 +251,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.scalbnl libc.src.math.sin libc.src.math.sincosf + libc.src.math.sincosf libc.src.math.sinf libc.src.math.sinhf libc.src.math.sqrt diff --git a/libc/docs/gpu/support.rst b/libc/docs/gpu/support.rst index 31bbdc41090be6..1044b005de0eef 100644 --- a/libc/docs/gpu/support.rst +++ b/libc/docs/gpu/support.rst @@ -190,6 +190,8 @@ strtold |check| strtoll |check| strtoul |check| strtoull |check| +srand |check| +rand |check| ============= 
========= ============ inttypes.h diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index d6e642c4907f04..3ca7bf9fdb2835 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -124,7 +124,9 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dsub | N/A | N/A | | N/A | | 7.12.14.2 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| f16fma | |check| | | | N/A | | 7.12.14.5 | F.10.11 | +| f16div | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.4 | F.10.11 | ++------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ +| f16fma | |check| | |check| | |check| | N/A | |check| | 7.12.14.5 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | fabs | |check| | |check| | |check| | |check| | |check| | 7.12.7.3 | F.10.4.3 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -264,7 +266,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | compoundn | | | | | | 7.12.7.2 | F.10.4.2 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| cos | |check| | large | | | | 7.12.4.5 | F.10.1.5 | +| cos | |check| | |check| | | | | 7.12.4.5 | F.10.1.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | cosh | |check| | | | | | 7.12.5.4 | F.10.2.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -290,7 +292,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | fma | |check| | |check| | | | | 7.12.13.1 | F.10.10.1 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| f16sqrt | |check| | | | N/A | | 7.12.14.6 | F.10.11 | +| f16sqrt | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.6 | F.10.11 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | fsqrt | N/A | | | N/A | | 7.12.14.6 | F.10.11 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -322,9 +324,9 @@ Higher Math Functions 
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | rsqrt | | | | | | 7.12.7.9 | F.10.4.9 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sin | |check| | large | | | | 7.12.4.6 | F.10.1.6 | +| sin | |check| | |check| | | | | 7.12.4.6 | F.10.1.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sincos | |check| | large | | | | | | +| sincos | |check| | |check| | | | | | | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sinh | |check| | | | | | 7.12.5.5 | F.10.2.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -348,6 +350,7 @@ Legends: tie-to-even). * x ULPs: largest errors recorded. * N/A: Not defined in the standard or will not be added. +* \*: LLVM libc extension. .. TODO(lntue): Add a new page to discuss about the algorithms used in the diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 9b3373a0ca3905..1cab1d3b812cf0 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -126,3 +126,12 @@ add_proxy_header_library( libc.include.llvm-libc-types.atexithandler_t libc.include.stdlib ) + +add_proxy_header_library( + struct_sigaction + HDRS + struct_sigaction.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.struct_sigaction + libc.include.signal +) diff --git a/libc/hdr/types/struct_sigaction.h b/libc/hdr/types/struct_sigaction.h new file mode 100644 index 00000000000000..60f6caeb4af106 --- /dev/null +++ b/libc/hdr/types/struct_sigaction.h @@ -0,0 +1,21 @@ +//===-- Proxy for struct sigaction ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_SIGACTION_H +#define LLVM_LIBC_HDR_TYPES_STRUCT_SIGACTION_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/struct_sigaction.h" + +#else + +#include + +#endif // LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_STRUCT_SIGACTION_H diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index bb10fd4c947034..3ab7817d8568b7 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -223,30 +223,23 @@ add_gen_header( GEN_HDR errno.h DEPENDS .llvm-libc-macros.generic_error_number_macros + .llvm-libc-macros.error_number_macros ) -if(EXISTS "${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/signal.h.in") - add_gen_header( - signal - DEF_FILE signal.h.def - PARAMS - platform_signal=../config/${LIBC_TARGET_OS}/signal.h.in - GEN_HDR signal.h - DATA_FILES - ../config/${LIBC_TARGET_OS}/signal.h.in - DEPENDS - .llvm-libc-macros.signal_macros - .llvm-libc-types.sig_atomic_t - .llvm-libc-types.sigset_t - .llvm-libc-types.struct_sigaction - .llvm-libc-types.union_sigval - .llvm-libc-types.siginfo_t - .llvm-libc-types.stack_t - .llvm-libc-types.pid_t - ) -else() - message(STATUS "Skipping header signal.h as the target config is missing") -endif() +add_gen_header( + signal + DEF_FILE signal.h.def + GEN_HDR signal.h + DEPENDS + .llvm-libc-macros.signal_macros + .llvm-libc-types.sig_atomic_t + .llvm-libc-types.sigset_t + .llvm-libc-types.struct_sigaction + .llvm-libc-types.union_sigval + .llvm-libc-types.siginfo_t + .llvm-libc-types.stack_t + .llvm-libc-types.pid_t +) add_gen_header( stdbit @@ -523,10 +516,6 @@ add_gen_header( sys_syscall DEF_FILE sys/syscall.h.def GEN_HDR sys/syscall.h - PARAMS - syscall_numbers=../config/${LIBC_TARGET_OS}/syscall_numbers.h.inc - DATA_FILES - ../config/${LIBC_TARGET_OS}/syscall_numbers.h.inc ) add_gen_header( diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index a4c2ca0df74d35..f6af11abd4dd76 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -103,6 +103,12 @@ add_macro_header( limits-macros.h ) +add_macro_header( + link_macros + HDR + link-macros.h +) + add_macro_header( math_macros HDR diff --git a/libc/include/llvm-libc-macros/float16-macros.h b/libc/include/llvm-libc-macros/float16-macros.h index e7d8d93bca1b60..9a11ecc49307e2 100644 --- a/libc/include/llvm-libc-macros/float16-macros.h +++ b/libc/include/llvm-libc-macros/float16-macros.h @@ -9,10 +9,18 @@ #ifndef LLVM_LIBC_MACROS_FLOAT16_MACROS_H #define LLVM_LIBC_MACROS_FLOAT16_MACROS_H +#include "../llvm-libc-types/float128.h" + #if defined(__FLT16_MANT_DIG__) && \ (!defined(__GNUC__) || __GNUC__ >= 13 || defined(__clang__)) && \ !defined(__arm__) && !defined(_M_ARM) && !defined(__riscv) #define LIBC_TYPES_HAS_FLOAT16 + +// TODO: This would no longer be required if HdrGen let us guard function +// declarations with multiple macros. 
+#ifdef LIBC_TYPES_HAS_FLOAT128 +#define LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128 +#endif // LIBC_TYPES_HAS_FLOAT128 #endif #endif // LLVM_LIBC_MACROS_FLOAT16_MACROS_H diff --git a/libc/include/llvm-libc-macros/link-macros.h b/libc/include/llvm-libc-macros/link-macros.h new file mode 100644 index 00000000000000..5c8cadab8e71cb --- /dev/null +++ b/libc/include/llvm-libc-macros/link-macros.h @@ -0,0 +1,13 @@ +//===-- Definition of macros for extra dynamic linker functionality ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __LP64__ +#define ElfW(type) Elf64_ ## type +#else +#define ElfW(type) Elf32_ ## type +#endif diff --git a/libc/include/sys/syscall.h.def b/libc/include/sys/syscall.h.def index 5cb6a20da77bee..03c19eb0885ed6 --- a/libc/include/sys/syscall.h.def +++ b/libc/include/sys/syscall.h.def @@ -9,9 +9,2345 @@ #ifndef LLVM_LIBC_SYS_SYSCALL_H #define LLVM_LIBC_SYS_SYSCALL_H - If syscall.h were a linux only file, then we do not need this indirection. - However, to keep the option of a non-linux OS requiring a syscall.h file, - with its own special syscall numbers, we use this indirection. -%%include_file(${syscall_numbers}) +//TODO: Handle non-linux syscalls + +#include + +#ifdef __NR_FAST_atomic_update +#define SYS_FAST_atomic_update __NR_FAST_atomic_update +#endif + +#ifdef __NR_FAST_cmpxchg +#define SYS_FAST_cmpxchg __NR_FAST_cmpxchg +#endif + +#ifdef __NR_FAST_cmpxchg64 +#define SYS_FAST_cmpxchg64 __NR_FAST_cmpxchg64 +#endif + +#ifdef __NR__llseek +#define SYS__llseek __NR__llseek +#endif + +#ifdef __NR__newselect +#define SYS__newselect __NR__newselect +#endif + +#ifdef __NR__sysctl +#define SYS__sysctl __NR__sysctl +#endif + +#ifdef __NR_accept +#define SYS_accept __NR_accept +#endif + +#ifdef __NR_accept4 +#define SYS_accept4 __NR_accept4 +#endif + +#ifdef __NR_access +#define SYS_access __NR_access +#endif + +#ifdef __NR_acct +#define SYS_acct __NR_acct +#endif + +#ifdef __NR_acl_get +#define SYS_acl_get __NR_acl_get +#endif + +#ifdef __NR_acl_set +#define SYS_acl_set __NR_acl_set +#endif + +#ifdef __NR_add_key +#define SYS_add_key __NR_add_key +#endif + +#ifdef __NR_adjtimex +#define SYS_adjtimex __NR_adjtimex +#endif + +#ifdef __NR_afs_syscall +#define SYS_afs_syscall __NR_afs_syscall +#endif + +#ifdef __NR_alarm +#define SYS_alarm __NR_alarm +#endif + +#ifdef __NR_alloc_hugepages +#define SYS_alloc_hugepages __NR_alloc_hugepages +#endif + +#ifdef __NR_arch_prctl +#define SYS_arch_prctl __NR_arch_prctl +#endif + +#ifdef __NR_arm_fadvise64_64 +#define SYS_arm_fadvise64_64 __NR_arm_fadvise64_64 +#endif + +#ifdef __NR_arm_sync_file_range +#define SYS_arm_sync_file_range __NR_arm_sync_file_range +#endif + +#ifdef __NR_atomic_barrier +#define SYS_atomic_barrier __NR_atomic_barrier +#endif + +#ifdef __NR_atomic_cmpxchg_32 +#define SYS_atomic_cmpxchg_32 __NR_atomic_cmpxchg_32 +#endif + +#ifdef __NR_attrctl +#define SYS_attrctl __NR_attrctl +#endif + +#ifdef __NR_bdflush +#define SYS_bdflush __NR_bdflush +#endif + +#ifdef __NR_bind +#define SYS_bind __NR_bind +#endif + +#ifdef __NR_bpf +#define SYS_bpf __NR_bpf +#endif + +#ifdef __NR_break +#define SYS_break __NR_break +#endif + +#ifdef __NR_brk +#define SYS_brk __NR_brk +#endif + +#ifdef __NR_cachectl +#define SYS_cachectl __NR_cachectl +#endif + +#ifdef
__NR_cacheflush +#define SYS_cacheflush __NR_cacheflush +#endif + +#ifdef __NR_capget +#define SYS_capget __NR_capget +#endif + +#ifdef __NR_capset +#define SYS_capset __NR_capset +#endif + +#ifdef __NR_chdir +#define SYS_chdir __NR_chdir +#endif + +#ifdef __NR_chmod +#define SYS_chmod __NR_chmod +#endif + +#ifdef __NR_chown +#define SYS_chown __NR_chown +#endif + +#ifdef __NR_chown32 +#define SYS_chown32 __NR_chown32 +#endif + +#ifdef __NR_chroot +#define SYS_chroot __NR_chroot +#endif + +#ifdef __NR_clock_adjtime +#define SYS_clock_adjtime __NR_clock_adjtime +#endif + +#ifdef __NR_clock_getres +#define SYS_clock_getres __NR_clock_getres +#endif + +#ifdef __NR_clock_gettime +#define SYS_clock_gettime __NR_clock_gettime +#endif + +#ifdef __NR_clock_gettime64 +#define SYS_clock_gettime64 __NR_clock_gettime64 +#endif + +#ifdef __NR_clock_nanosleep +#define SYS_clock_nanosleep __NR_clock_nanosleep +#endif + +#ifdef __NR_clock_nanosleep_time64 +#define SYS_clock_nanosleep_time64 __NR_clock_nanosleep_time64 +#endif + +#ifdef __NR_clock_settime +#define SYS_clock_settime __NR_clock_settime +#endif + +#ifdef __NR_clone +#define SYS_clone __NR_clone +#endif + +#ifdef __NR_clone2 +#define SYS_clone2 __NR_clone2 +#endif + +#ifdef __NR_close +#define SYS_close __NR_close +#endif + +#ifdef __NR_cmpxchg_badaddr +#define SYS_cmpxchg_badaddr __NR_cmpxchg_badaddr +#endif + +#ifdef __NR_connect +#define SYS_connect __NR_connect +#endif + +#ifdef __NR_copy_file_range +#define SYS_copy_file_range __NR_copy_file_range +#endif + +#ifdef __NR_creat +#define SYS_creat __NR_creat +#endif + +#ifdef __NR_create_module +#define SYS_create_module __NR_create_module +#endif + +#ifdef __NR_delete_module +#define SYS_delete_module __NR_delete_module +#endif + +#ifdef __NR_dipc +#define SYS_dipc __NR_dipc +#endif + +#ifdef __NR_dup +#define SYS_dup __NR_dup +#endif + +#ifdef __NR_dup2 +#define SYS_dup2 __NR_dup2 +#endif + +#ifdef __NR_dup3 +#define SYS_dup3 __NR_dup3 +#endif + +#ifdef __NR_epoll_create +#define SYS_epoll_create __NR_epoll_create +#endif + +#ifdef __NR_epoll_create1 +#define SYS_epoll_create1 __NR_epoll_create1 +#endif + +#ifdef __NR_epoll_ctl +#define SYS_epoll_ctl __NR_epoll_ctl +#endif + +#ifdef __NR_epoll_ctl_old +#define SYS_epoll_ctl_old __NR_epoll_ctl_old +#endif + +#ifdef __NR_epoll_pwait +#define SYS_epoll_pwait __NR_epoll_pwait +#endif + +#ifdef __NR_epoll_pwait2 +#define SYS_epoll_pwait2 __NR_epoll_pwait2 +#endif + +#ifdef __NR_epoll_wait +#define SYS_epoll_wait __NR_epoll_wait +#endif + +#ifdef __NR_epoll_wait_old +#define SYS_epoll_wait_old __NR_epoll_wait_old +#endif + +#ifdef __NR_eventfd +#define SYS_eventfd __NR_eventfd +#endif + +#ifdef __NR_eventfd2 +#define SYS_eventfd2 __NR_eventfd2 +#endif + +#ifdef __NR_exec_with_loader +#define SYS_exec_with_loader __NR_exec_with_loader +#endif + +#ifdef __NR_execv +#define SYS_execv __NR_execv +#endif + +#ifdef __NR_execve +#define SYS_execve __NR_execve +#endif + +#ifdef __NR_execveat +#define SYS_execveat __NR_execveat +#endif + +#ifdef __NR_exit +#define SYS_exit __NR_exit +#endif + +#ifdef __NR_exit_group +#define SYS_exit_group __NR_exit_group +#endif + +#ifdef __NR_faccessat +#define SYS_faccessat __NR_faccessat +#endif + +#ifdef __NR_fadvise64 +#define SYS_fadvise64 __NR_fadvise64 +#endif + +#ifdef __NR_fadvise64_64 +#define SYS_fadvise64_64 __NR_fadvise64_64 +#endif + +#ifdef __NR_fallocate +#define SYS_fallocate __NR_fallocate +#endif + +#ifdef __NR_fanotify_init +#define SYS_fanotify_init __NR_fanotify_init +#endif + +#ifdef 
__NR_fanotify_mark +#define SYS_fanotify_mark __NR_fanotify_mark +#endif + +#ifdef __NR_fchdir +#define SYS_fchdir __NR_fchdir +#endif + +#ifdef __NR_fchmod +#define SYS_fchmod __NR_fchmod +#endif + +#ifdef __NR_fchmodat +#define SYS_fchmodat __NR_fchmodat +#endif + +#ifdef __NR_fchmodat2 +#define SYS_fchmodat2 __NR_fchmodat2 +#endif + +#ifdef __NR_fchown +#define SYS_fchown __NR_fchown +#endif + +#ifdef __NR_fchown32 +#define SYS_fchown32 __NR_fchown32 +#endif + +#ifdef __NR_fchownat +#define SYS_fchownat __NR_fchownat +#endif + +#ifdef __NR_fcntl +#define SYS_fcntl __NR_fcntl +#endif + +#ifdef __NR_fcntl64 +#define SYS_fcntl64 __NR_fcntl64 +#endif + +#ifdef __NR_fdatasync +#define SYS_fdatasync __NR_fdatasync +#endif + +#ifdef __NR_fgetxattr +#define SYS_fgetxattr __NR_fgetxattr +#endif + +#ifdef __NR_finit_module +#define SYS_finit_module __NR_finit_module +#endif + +#ifdef __NR_flistxattr +#define SYS_flistxattr __NR_flistxattr +#endif + +#ifdef __NR_flock +#define SYS_flock __NR_flock +#endif + +#ifdef __NR_fork +#define SYS_fork __NR_fork +#endif + +#ifdef __NR_free_hugepages +#define SYS_free_hugepages __NR_free_hugepages +#endif + +#ifdef __NR_fremovexattr +#define SYS_fremovexattr __NR_fremovexattr +#endif + +#ifdef __NR_fsetxattr +#define SYS_fsetxattr __NR_fsetxattr +#endif + +#ifdef __NR_fstat +#define SYS_fstat __NR_fstat +#endif + +#ifdef __NR_fstat64 +#define SYS_fstat64 __NR_fstat64 +#endif + +#ifdef __NR_fstatat64 +#define SYS_fstatat64 __NR_fstatat64 +#endif + +#ifdef __NR_fstatfs +#define SYS_fstatfs __NR_fstatfs +#endif + +#ifdef __NR_fstatfs64 +#define SYS_fstatfs64 __NR_fstatfs64 +#endif + +#ifdef __NR_fsync +#define SYS_fsync __NR_fsync +#endif + +#ifdef __NR_ftime +#define SYS_ftime __NR_ftime +#endif + +#ifdef __NR_ftruncate +#define SYS_ftruncate __NR_ftruncate +#endif + +#ifdef __NR_ftruncate64 +#define SYS_ftruncate64 __NR_ftruncate64 +#endif + +#ifdef __NR_futex +#define SYS_futex __NR_futex +#endif + +#ifdef __NR_futex_time64 +#define SYS_futex_time64 __NR_futex_time64 +#endif + +#ifdef __NR_futimesat +#define SYS_futimesat __NR_futimesat +#endif + +#ifdef __NR_get_kernel_syms +#define SYS_get_kernel_syms __NR_get_kernel_syms +#endif + +#ifdef __NR_get_mempolicy +#define SYS_get_mempolicy __NR_get_mempolicy +#endif + +#ifdef __NR_get_robust_list +#define SYS_get_robust_list __NR_get_robust_list +#endif + +#ifdef __NR_get_thread_area +#define SYS_get_thread_area __NR_get_thread_area +#endif + +#ifdef __NR_getcpu +#define SYS_getcpu __NR_getcpu +#endif + +#ifdef __NR_getcwd +#define SYS_getcwd __NR_getcwd +#endif + +#ifdef __NR_getdents +#define SYS_getdents __NR_getdents +#endif + +#ifdef __NR_getdents64 +#define SYS_getdents64 __NR_getdents64 +#endif + +#ifdef __NR_getdomainname +#define SYS_getdomainname __NR_getdomainname +#endif + +#ifdef __NR_getdtablesize +#define SYS_getdtablesize __NR_getdtablesize +#endif + +#ifdef __NR_getegid +#define SYS_getegid __NR_getegid +#endif + +#ifdef __NR_getegid32 +#define SYS_getegid32 __NR_getegid32 +#endif + +#ifdef __NR_geteuid +#define SYS_geteuid __NR_geteuid +#endif + +#ifdef __NR_geteuid32 +#define SYS_geteuid32 __NR_geteuid32 +#endif + +#ifdef __NR_getgid +#define SYS_getgid __NR_getgid +#endif + +#ifdef __NR_getgid32 +#define SYS_getgid32 __NR_getgid32 +#endif + +#ifdef __NR_getgroups +#define SYS_getgroups __NR_getgroups +#endif + +#ifdef __NR_getgroups32 +#define SYS_getgroups32 __NR_getgroups32 +#endif + +#ifdef __NR_gethostname +#define SYS_gethostname __NR_gethostname +#endif + +#ifdef __NR_getitimer 
+#define SYS_getitimer __NR_getitimer +#endif + +#ifdef __NR_getpagesize +#define SYS_getpagesize __NR_getpagesize +#endif + +#ifdef __NR_getpeername +#define SYS_getpeername __NR_getpeername +#endif + +#ifdef __NR_getpgid +#define SYS_getpgid __NR_getpgid +#endif + +#ifdef __NR_getpgrp +#define SYS_getpgrp __NR_getpgrp +#endif + +#ifdef __NR_getpid +#define SYS_getpid __NR_getpid +#endif + +#ifdef __NR_getpmsg +#define SYS_getpmsg __NR_getpmsg +#endif + +#ifdef __NR_getppid +#define SYS_getppid __NR_getppid +#endif + +#ifdef __NR_getpriority +#define SYS_getpriority __NR_getpriority +#endif + +#ifdef __NR_getrandom +#define SYS_getrandom __NR_getrandom +#endif + +#ifdef __NR_getresgid +#define SYS_getresgid __NR_getresgid +#endif + +#ifdef __NR_getresgid32 +#define SYS_getresgid32 __NR_getresgid32 +#endif + +#ifdef __NR_getresuid +#define SYS_getresuid __NR_getresuid +#endif + +#ifdef __NR_getresuid32 +#define SYS_getresuid32 __NR_getresuid32 +#endif + +#ifdef __NR_getrlimit +#define SYS_getrlimit __NR_getrlimit +#endif + +#ifdef __NR_getrusage +#define SYS_getrusage __NR_getrusage +#endif + +#ifdef __NR_getsid +#define SYS_getsid __NR_getsid +#endif + +#ifdef __NR_getsockname +#define SYS_getsockname __NR_getsockname +#endif + +#ifdef __NR_getsockopt +#define SYS_getsockopt __NR_getsockopt +#endif + +#ifdef __NR_gettid +#define SYS_gettid __NR_gettid +#endif + +#ifdef __NR_gettimeofday +#define SYS_gettimeofday __NR_gettimeofday +#endif + +#ifdef __NR_getuid +#define SYS_getuid __NR_getuid +#endif + +#ifdef __NR_getuid32 +#define SYS_getuid32 __NR_getuid32 +#endif + +#ifdef __NR_getunwind +#define SYS_getunwind __NR_getunwind +#endif + +#ifdef __NR_getxattr +#define SYS_getxattr __NR_getxattr +#endif + +#ifdef __NR_getxgid +#define SYS_getxgid __NR_getxgid +#endif + +#ifdef __NR_getxpid +#define SYS_getxpid __NR_getxpid +#endif + +#ifdef __NR_getxuid +#define SYS_getxuid __NR_getxuid +#endif + +#ifdef __NR_gtty +#define SYS_gtty __NR_gtty +#endif + +#ifdef __NR_idle +#define SYS_idle __NR_idle +#endif + +#ifdef __NR_init_module +#define SYS_init_module __NR_init_module +#endif + +#ifdef __NR_inotify_add_watch +#define SYS_inotify_add_watch __NR_inotify_add_watch +#endif + +#ifdef __NR_inotify_init +#define SYS_inotify_init __NR_inotify_init +#endif + +#ifdef __NR_inotify_init1 +#define SYS_inotify_init1 __NR_inotify_init1 +#endif + +#ifdef __NR_inotify_rm_watch +#define SYS_inotify_rm_watch __NR_inotify_rm_watch +#endif + +#ifdef __NR_io_cancel +#define SYS_io_cancel __NR_io_cancel +#endif + +#ifdef __NR_io_destroy +#define SYS_io_destroy __NR_io_destroy +#endif + +#ifdef __NR_io_getevents +#define SYS_io_getevents __NR_io_getevents +#endif + +#ifdef __NR_io_pgetevents +#define SYS_io_pgetevents __NR_io_pgetevents +#endif + +#ifdef __NR_io_setup +#define SYS_io_setup __NR_io_setup +#endif + +#ifdef __NR_io_submit +#define SYS_io_submit __NR_io_submit +#endif + +#ifdef __NR_ioctl +#define SYS_ioctl __NR_ioctl +#endif + +#ifdef __NR_ioperm +#define SYS_ioperm __NR_ioperm +#endif + +#ifdef __NR_iopl +#define SYS_iopl __NR_iopl +#endif + +#ifdef __NR_ioprio_get +#define SYS_ioprio_get __NR_ioprio_get +#endif + +#ifdef __NR_ioprio_set +#define SYS_ioprio_set __NR_ioprio_set +#endif + +#ifdef __NR_ipc +#define SYS_ipc __NR_ipc +#endif + +#ifdef __NR_kcmp +#define SYS_kcmp __NR_kcmp +#endif + +#ifdef __NR_kern_features +#define SYS_kern_features __NR_kern_features +#endif + +#ifdef __NR_kexec_file_load +#define SYS_kexec_file_load __NR_kexec_file_load +#endif + +#ifdef __NR_kexec_load +#define 
SYS_kexec_load __NR_kexec_load +#endif + +#ifdef __NR_keyctl +#define SYS_keyctl __NR_keyctl +#endif + +#ifdef __NR_kill +#define SYS_kill __NR_kill +#endif + +#ifdef __NR_lchown +#define SYS_lchown __NR_lchown +#endif + +#ifdef __NR_lchown32 +#define SYS_lchown32 __NR_lchown32 +#endif + +#ifdef __NR_lgetxattr +#define SYS_lgetxattr __NR_lgetxattr +#endif + +#ifdef __NR_link +#define SYS_link __NR_link +#endif + +#ifdef __NR_linkat +#define SYS_linkat __NR_linkat +#endif + +#ifdef __NR_listen +#define SYS_listen __NR_listen +#endif + +#ifdef __NR_listxattr +#define SYS_listxattr __NR_listxattr +#endif + +#ifdef __NR_llistxattr +#define SYS_llistxattr __NR_llistxattr +#endif + +#ifdef __NR_llseek +#define SYS_llseek __NR_llseek +#endif + +#ifdef __NR_lock +#define SYS_lock __NR_lock +#endif + +#ifdef __NR_lookup_dcookie +#define SYS_lookup_dcookie __NR_lookup_dcookie +#endif + +#ifdef __NR_lremovexattr +#define SYS_lremovexattr __NR_lremovexattr +#endif + +#ifdef __NR_lseek +#define SYS_lseek __NR_lseek +#endif + +#ifdef __NR_lsetxattr +#define SYS_lsetxattr __NR_lsetxattr +#endif + +#ifdef __NR_lstat +#define SYS_lstat __NR_lstat +#endif + +#ifdef __NR_lstat64 +#define SYS_lstat64 __NR_lstat64 +#endif + +#ifdef __NR_madvise +#define SYS_madvise __NR_madvise +#endif + +#ifdef __NR_mbind +#define SYS_mbind __NR_mbind +#endif + +#ifdef __NR_membarrier +#define SYS_membarrier __NR_membarrier +#endif + +#ifdef __NR_memfd_create +#define SYS_memfd_create __NR_memfd_create +#endif + +#ifdef __NR_memory_ordering +#define SYS_memory_ordering __NR_memory_ordering +#endif + +#ifdef __NR_migrate_pages +#define SYS_migrate_pages __NR_migrate_pages +#endif + +#ifdef __NR_mincore +#define SYS_mincore __NR_mincore +#endif + +#ifdef __NR_mkdir +#define SYS_mkdir __NR_mkdir +#endif + +#ifdef __NR_mkdirat +#define SYS_mkdirat __NR_mkdirat +#endif + +#ifdef __NR_mknod +#define SYS_mknod __NR_mknod +#endif + +#ifdef __NR_mknodat +#define SYS_mknodat __NR_mknodat +#endif + +#ifdef __NR_mlock +#define SYS_mlock __NR_mlock +#endif + +#ifdef __NR_mlock2 +#define SYS_mlock2 __NR_mlock2 +#endif + +#ifdef __NR_mlockall +#define SYS_mlockall __NR_mlockall +#endif + +#ifdef __NR_mmap +#define SYS_mmap __NR_mmap +#endif + +#ifdef __NR_mmap2 +#define SYS_mmap2 __NR_mmap2 +#endif + +#ifdef __NR_modify_ldt +#define SYS_modify_ldt __NR_modify_ldt +#endif + +#ifdef __NR_mount +#define SYS_mount __NR_mount +#endif + +#ifdef __NR_move_pages +#define SYS_move_pages __NR_move_pages +#endif + +#ifdef __NR_mprotect +#define SYS_mprotect __NR_mprotect +#endif + +#ifdef __NR_mpx +#define SYS_mpx __NR_mpx +#endif + +#ifdef __NR_mq_getsetattr +#define SYS_mq_getsetattr __NR_mq_getsetattr +#endif + +#ifdef __NR_mq_notify +#define SYS_mq_notify __NR_mq_notify +#endif + +#ifdef __NR_mq_open +#define SYS_mq_open __NR_mq_open +#endif + +#ifdef __NR_mq_timedreceive +#define SYS_mq_timedreceive __NR_mq_timedreceive +#endif + +#ifdef __NR_mq_timedsend +#define SYS_mq_timedsend __NR_mq_timedsend +#endif + +#ifdef __NR_mq_unlink +#define SYS_mq_unlink __NR_mq_unlink +#endif + +#ifdef __NR_mremap +#define SYS_mremap __NR_mremap +#endif + +#ifdef __NR_msgctl +#define SYS_msgctl __NR_msgctl +#endif + +#ifdef __NR_msgget +#define SYS_msgget __NR_msgget +#endif + +#ifdef __NR_msgrcv +#define SYS_msgrcv __NR_msgrcv +#endif + +#ifdef __NR_msgsnd +#define SYS_msgsnd __NR_msgsnd +#endif + +#ifdef __NR_msync +#define SYS_msync __NR_msync +#endif + +#ifdef __NR_multiplexer +#define SYS_multiplexer __NR_multiplexer +#endif + +#ifdef __NR_munlock +#define 
SYS_munlock __NR_munlock +#endif + +#ifdef __NR_munlockall +#define SYS_munlockall __NR_munlockall +#endif + +#ifdef __NR_munmap +#define SYS_munmap __NR_munmap +#endif + +#ifdef __NR_name_to_handle_at +#define SYS_name_to_handle_at __NR_name_to_handle_at +#endif + +#ifdef __NR_nanosleep +#define SYS_nanosleep __NR_nanosleep +#endif + +#ifdef __NR_newfstatat +#define SYS_newfstatat __NR_newfstatat +#endif + +#ifdef __NR_nfsservctl +#define SYS_nfsservctl __NR_nfsservctl +#endif + +#ifdef __NR_ni_syscall +#define SYS_ni_syscall __NR_ni_syscall +#endif + +#ifdef __NR_nice +#define SYS_nice __NR_nice +#endif + +#ifdef __NR_old_adjtimex +#define SYS_old_adjtimex __NR_old_adjtimex +#endif + +#ifdef __NR_oldfstat +#define SYS_oldfstat __NR_oldfstat +#endif + +#ifdef __NR_oldlstat +#define SYS_oldlstat __NR_oldlstat +#endif + +#ifdef __NR_oldolduname +#define SYS_oldolduname __NR_oldolduname +#endif + +#ifdef __NR_oldstat +#define SYS_oldstat __NR_oldstat +#endif + +#ifdef __NR_oldumount +#define SYS_oldumount __NR_oldumount +#endif + +#ifdef __NR_olduname +#define SYS_olduname __NR_olduname +#endif + +#ifdef __NR_open +#define SYS_open __NR_open +#endif + +#ifdef __NR_open_by_handle_at +#define SYS_open_by_handle_at __NR_open_by_handle_at +#endif + +#ifdef __NR_openat +#define SYS_openat __NR_openat +#endif + +#ifdef __NR_osf_adjtime +#define SYS_osf_adjtime __NR_osf_adjtime +#endif + +#ifdef __NR_osf_afs_syscall +#define SYS_osf_afs_syscall __NR_osf_afs_syscall +#endif + +#ifdef __NR_osf_alt_plock +#define SYS_osf_alt_plock __NR_osf_alt_plock +#endif + +#ifdef __NR_osf_alt_setsid +#define SYS_osf_alt_setsid __NR_osf_alt_setsid +#endif + +#ifdef __NR_osf_alt_sigpending +#define SYS_osf_alt_sigpending __NR_osf_alt_sigpending +#endif + +#ifdef __NR_osf_asynch_daemon +#define SYS_osf_asynch_daemon __NR_osf_asynch_daemon +#endif + +#ifdef __NR_osf_audcntl +#define SYS_osf_audcntl __NR_osf_audcntl +#endif + +#ifdef __NR_osf_audgen +#define SYS_osf_audgen __NR_osf_audgen +#endif + +#ifdef __NR_osf_chflags +#define SYS_osf_chflags __NR_osf_chflags +#endif + +#ifdef __NR_osf_execve +#define SYS_osf_execve __NR_osf_execve +#endif + +#ifdef __NR_osf_exportfs +#define SYS_osf_exportfs __NR_osf_exportfs +#endif + +#ifdef __NR_osf_fchflags +#define SYS_osf_fchflags __NR_osf_fchflags +#endif + +#ifdef __NR_osf_fdatasync +#define SYS_osf_fdatasync __NR_osf_fdatasync +#endif + +#ifdef __NR_osf_fpathconf +#define SYS_osf_fpathconf __NR_osf_fpathconf +#endif + +#ifdef __NR_osf_fstat +#define SYS_osf_fstat __NR_osf_fstat +#endif + +#ifdef __NR_osf_fstatfs +#define SYS_osf_fstatfs __NR_osf_fstatfs +#endif + +#ifdef __NR_osf_fstatfs64 +#define SYS_osf_fstatfs64 __NR_osf_fstatfs64 +#endif + +#ifdef __NR_osf_fuser +#define SYS_osf_fuser __NR_osf_fuser +#endif + +#ifdef __NR_osf_getaddressconf +#define SYS_osf_getaddressconf __NR_osf_getaddressconf +#endif + +#ifdef __NR_osf_getdirentries +#define SYS_osf_getdirentries __NR_osf_getdirentries +#endif + +#ifdef __NR_osf_getdomainname +#define SYS_osf_getdomainname __NR_osf_getdomainname +#endif + +#ifdef __NR_osf_getfh +#define SYS_osf_getfh __NR_osf_getfh +#endif + +#ifdef __NR_osf_getfsstat +#define SYS_osf_getfsstat __NR_osf_getfsstat +#endif + +#ifdef __NR_osf_gethostid +#define SYS_osf_gethostid __NR_osf_gethostid +#endif + +#ifdef __NR_osf_getitimer +#define SYS_osf_getitimer __NR_osf_getitimer +#endif + +#ifdef __NR_osf_getlogin +#define SYS_osf_getlogin __NR_osf_getlogin +#endif + +#ifdef __NR_osf_getmnt +#define SYS_osf_getmnt __NR_osf_getmnt +#endif + +#ifdef 
__NR_osf_getrusage +#define SYS_osf_getrusage __NR_osf_getrusage +#endif + +#ifdef __NR_osf_getsysinfo +#define SYS_osf_getsysinfo __NR_osf_getsysinfo +#endif + +#ifdef __NR_osf_gettimeofday +#define SYS_osf_gettimeofday __NR_osf_gettimeofday +#endif + +#ifdef __NR_osf_kloadcall +#define SYS_osf_kloadcall __NR_osf_kloadcall +#endif + +#ifdef __NR_osf_kmodcall +#define SYS_osf_kmodcall __NR_osf_kmodcall +#endif + +#ifdef __NR_osf_lstat +#define SYS_osf_lstat __NR_osf_lstat +#endif + +#ifdef __NR_osf_memcntl +#define SYS_osf_memcntl __NR_osf_memcntl +#endif + +#ifdef __NR_osf_mincore +#define SYS_osf_mincore __NR_osf_mincore +#endif + +#ifdef __NR_osf_mount +#define SYS_osf_mount __NR_osf_mount +#endif + +#ifdef __NR_osf_mremap +#define SYS_osf_mremap __NR_osf_mremap +#endif + +#ifdef __NR_osf_msfs_syscall +#define SYS_osf_msfs_syscall __NR_osf_msfs_syscall +#endif + +#ifdef __NR_osf_msleep +#define SYS_osf_msleep __NR_osf_msleep +#endif + +#ifdef __NR_osf_mvalid +#define SYS_osf_mvalid __NR_osf_mvalid +#endif + +#ifdef __NR_osf_mwakeup +#define SYS_osf_mwakeup __NR_osf_mwakeup +#endif + +#ifdef __NR_osf_naccept +#define SYS_osf_naccept __NR_osf_naccept +#endif + +#ifdef __NR_osf_nfssvc +#define SYS_osf_nfssvc __NR_osf_nfssvc +#endif + +#ifdef __NR_osf_ngetpeername +#define SYS_osf_ngetpeername __NR_osf_ngetpeername +#endif + +#ifdef __NR_osf_ngetsockname +#define SYS_osf_ngetsockname __NR_osf_ngetsockname +#endif + +#ifdef __NR_osf_nrecvfrom +#define SYS_osf_nrecvfrom __NR_osf_nrecvfrom +#endif + +#ifdef __NR_osf_nrecvmsg +#define SYS_osf_nrecvmsg __NR_osf_nrecvmsg +#endif + +#ifdef __NR_osf_nsendmsg +#define SYS_osf_nsendmsg __NR_osf_nsendmsg +#endif + +#ifdef __NR_osf_ntp_adjtime +#define SYS_osf_ntp_adjtime __NR_osf_ntp_adjtime +#endif + +#ifdef __NR_osf_ntp_gettime +#define SYS_osf_ntp_gettime __NR_osf_ntp_gettime +#endif + +#ifdef __NR_osf_old_creat +#define SYS_osf_old_creat __NR_osf_old_creat +#endif + +#ifdef __NR_osf_old_fstat +#define SYS_osf_old_fstat __NR_osf_old_fstat +#endif + +#ifdef __NR_osf_old_getpgrp +#define SYS_osf_old_getpgrp __NR_osf_old_getpgrp +#endif + +#ifdef __NR_osf_old_killpg +#define SYS_osf_old_killpg __NR_osf_old_killpg +#endif + +#ifdef __NR_osf_old_lstat +#define SYS_osf_old_lstat __NR_osf_old_lstat +#endif + +#ifdef __NR_osf_old_open +#define SYS_osf_old_open __NR_osf_old_open +#endif + +#ifdef __NR_osf_old_sigaction +#define SYS_osf_old_sigaction __NR_osf_old_sigaction +#endif + +#ifdef __NR_osf_old_sigblock +#define SYS_osf_old_sigblock __NR_osf_old_sigblock +#endif + +#ifdef __NR_osf_old_sigreturn +#define SYS_osf_old_sigreturn __NR_osf_old_sigreturn +#endif + +#ifdef __NR_osf_old_sigsetmask +#define SYS_osf_old_sigsetmask __NR_osf_old_sigsetmask +#endif + +#ifdef __NR_osf_old_sigvec +#define SYS_osf_old_sigvec __NR_osf_old_sigvec +#endif + +#ifdef __NR_osf_old_stat +#define SYS_osf_old_stat __NR_osf_old_stat +#endif + +#ifdef __NR_osf_old_vadvise +#define SYS_osf_old_vadvise __NR_osf_old_vadvise +#endif + +#ifdef __NR_osf_old_vtrace +#define SYS_osf_old_vtrace __NR_osf_old_vtrace +#endif + +#ifdef __NR_osf_old_wait +#define SYS_osf_old_wait __NR_osf_old_wait +#endif + +#ifdef __NR_osf_oldquota +#define SYS_osf_oldquota __NR_osf_oldquota +#endif + +#ifdef __NR_osf_pathconf +#define SYS_osf_pathconf __NR_osf_pathconf +#endif + +#ifdef __NR_osf_pid_block +#define SYS_osf_pid_block __NR_osf_pid_block +#endif + +#ifdef __NR_osf_pid_unblock +#define SYS_osf_pid_unblock __NR_osf_pid_unblock +#endif + +#ifdef __NR_osf_plock +#define SYS_osf_plock 
__NR_osf_plock +#endif + +#ifdef __NR_osf_priocntlset +#define SYS_osf_priocntlset __NR_osf_priocntlset +#endif + +#ifdef __NR_osf_profil +#define SYS_osf_profil __NR_osf_profil +#endif + +#ifdef __NR_osf_proplist_syscall +#define SYS_osf_proplist_syscall __NR_osf_proplist_syscall +#endif + +#ifdef __NR_osf_reboot +#define SYS_osf_reboot __NR_osf_reboot +#endif + +#ifdef __NR_osf_revoke +#define SYS_osf_revoke __NR_osf_revoke +#endif + +#ifdef __NR_osf_sbrk +#define SYS_osf_sbrk __NR_osf_sbrk +#endif + +#ifdef __NR_osf_security +#define SYS_osf_security __NR_osf_security +#endif + +#ifdef __NR_osf_select +#define SYS_osf_select __NR_osf_select +#endif + +#ifdef __NR_osf_set_program_attributes +#define SYS_osf_set_program_attributes __NR_osf_set_program_attributes +#endif + +#ifdef __NR_osf_set_speculative +#define SYS_osf_set_speculative __NR_osf_set_speculative +#endif + +#ifdef __NR_osf_sethostid +#define SYS_osf_sethostid __NR_osf_sethostid +#endif + +#ifdef __NR_osf_setitimer +#define SYS_osf_setitimer __NR_osf_setitimer +#endif + +#ifdef __NR_osf_setlogin +#define SYS_osf_setlogin __NR_osf_setlogin +#endif + +#ifdef __NR_osf_setsysinfo +#define SYS_osf_setsysinfo __NR_osf_setsysinfo +#endif + +#ifdef __NR_osf_settimeofday +#define SYS_osf_settimeofday __NR_osf_settimeofday +#endif + +#ifdef __NR_osf_shmat +#define SYS_osf_shmat __NR_osf_shmat +#endif + +#ifdef __NR_osf_signal +#define SYS_osf_signal __NR_osf_signal +#endif + +#ifdef __NR_osf_sigprocmask +#define SYS_osf_sigprocmask __NR_osf_sigprocmask +#endif + +#ifdef __NR_osf_sigsendset +#define SYS_osf_sigsendset __NR_osf_sigsendset +#endif + +#ifdef __NR_osf_sigstack +#define SYS_osf_sigstack __NR_osf_sigstack +#endif + +#ifdef __NR_osf_sigwaitprim +#define SYS_osf_sigwaitprim __NR_osf_sigwaitprim +#endif + +#ifdef __NR_osf_sstk +#define SYS_osf_sstk __NR_osf_sstk +#endif + +#ifdef __NR_osf_stat +#define SYS_osf_stat __NR_osf_stat +#endif + +#ifdef __NR_osf_statfs +#define SYS_osf_statfs __NR_osf_statfs +#endif + +#ifdef __NR_osf_statfs64 +#define SYS_osf_statfs64 __NR_osf_statfs64 +#endif + +#ifdef __NR_osf_subsys_info +#define SYS_osf_subsys_info __NR_osf_subsys_info +#endif + +#ifdef __NR_osf_swapctl +#define SYS_osf_swapctl __NR_osf_swapctl +#endif + +#ifdef __NR_osf_swapon +#define SYS_osf_swapon __NR_osf_swapon +#endif + +#ifdef __NR_osf_syscall +#define SYS_osf_syscall __NR_osf_syscall +#endif + +#ifdef __NR_osf_sysinfo +#define SYS_osf_sysinfo __NR_osf_sysinfo +#endif + +#ifdef __NR_osf_table +#define SYS_osf_table __NR_osf_table +#endif + +#ifdef __NR_osf_uadmin +#define SYS_osf_uadmin __NR_osf_uadmin +#endif + +#ifdef __NR_osf_usleep_thread +#define SYS_osf_usleep_thread __NR_osf_usleep_thread +#endif + +#ifdef __NR_osf_uswitch +#define SYS_osf_uswitch __NR_osf_uswitch +#endif + +#ifdef __NR_osf_utc_adjtime +#define SYS_osf_utc_adjtime __NR_osf_utc_adjtime +#endif + +#ifdef __NR_osf_utc_gettime +#define SYS_osf_utc_gettime __NR_osf_utc_gettime +#endif + +#ifdef __NR_osf_utimes +#define SYS_osf_utimes __NR_osf_utimes +#endif + +#ifdef __NR_osf_utsname +#define SYS_osf_utsname __NR_osf_utsname +#endif + +#ifdef __NR_osf_wait4 +#define SYS_osf_wait4 __NR_osf_wait4 +#endif + +#ifdef __NR_osf_waitid +#define SYS_osf_waitid __NR_osf_waitid +#endif + +#ifdef __NR_pause +#define SYS_pause __NR_pause +#endif + +#ifdef __NR_pciconfig_iobase +#define SYS_pciconfig_iobase __NR_pciconfig_iobase +#endif + +#ifdef __NR_pciconfig_read +#define SYS_pciconfig_read __NR_pciconfig_read +#endif + +#ifdef __NR_pciconfig_write +#define 
SYS_pciconfig_write __NR_pciconfig_write +#endif + +#ifdef __NR_perf_event_open +#define SYS_perf_event_open __NR_perf_event_open +#endif + +#ifdef __NR_perfctr +#define SYS_perfctr __NR_perfctr +#endif + +#ifdef __NR_perfmonctl +#define SYS_perfmonctl __NR_perfmonctl +#endif + +#ifdef __NR_personality +#define SYS_personality __NR_personality +#endif + +#ifdef __NR_pipe +#define SYS_pipe __NR_pipe +#endif + +#ifdef __NR_pipe2 +#define SYS_pipe2 __NR_pipe2 +#endif + +#ifdef __NR_pivot_root +#define SYS_pivot_root __NR_pivot_root +#endif + +#ifdef __NR_pkey_alloc +#define SYS_pkey_alloc __NR_pkey_alloc +#endif + +#ifdef __NR_pkey_free +#define SYS_pkey_free __NR_pkey_free +#endif + +#ifdef __NR_pkey_mprotect +#define SYS_pkey_mprotect __NR_pkey_mprotect +#endif + +#ifdef __NR_poll +#define SYS_poll __NR_poll +#endif + +#ifdef __NR_ppoll +#define SYS_ppoll __NR_ppoll +#endif + +#ifdef __NR_prctl +#define SYS_prctl __NR_prctl +#endif + +#ifdef __NR_pread64 +#define SYS_pread64 __NR_pread64 +#endif + +#ifdef __NR_preadv +#define SYS_preadv __NR_preadv +#endif + +#ifdef __NR_preadv2 +#define SYS_preadv2 __NR_preadv2 +#endif + +#ifdef __NR_prlimit64 +#define SYS_prlimit64 __NR_prlimit64 +#endif + +#ifdef __NR_process_vm_readv +#define SYS_process_vm_readv __NR_process_vm_readv +#endif + +#ifdef __NR_process_vm_writev +#define SYS_process_vm_writev __NR_process_vm_writev +#endif + +#ifdef __NR_prof +#define SYS_prof __NR_prof +#endif + +#ifdef __NR_profil +#define SYS_profil __NR_profil +#endif + +#ifdef __NR_pselect6 +#define SYS_pselect6 __NR_pselect6 +#endif + +#ifdef __NR_pselect6_time64 +#define SYS_pselect6_time64 __NR_pselect6_time64 +#endif + +#ifdef __NR_ptrace +#define SYS_ptrace __NR_ptrace +#endif + +#ifdef __NR_putpmsg +#define SYS_putpmsg __NR_putpmsg +#endif + +#ifdef __NR_pwrite64 +#define SYS_pwrite64 __NR_pwrite64 +#endif + +#ifdef __NR_pwritev +#define SYS_pwritev __NR_pwritev +#endif + +#ifdef __NR_pwritev2 +#define SYS_pwritev2 __NR_pwritev2 +#endif + +#ifdef __NR_query_module +#define SYS_query_module __NR_query_module +#endif + +#ifdef __NR_quotactl +#define SYS_quotactl __NR_quotactl +#endif + +#ifdef __NR_read +#define SYS_read __NR_read +#endif + +#ifdef __NR_readahead +#define SYS_readahead __NR_readahead +#endif + +#ifdef __NR_readdir +#define SYS_readdir __NR_readdir +#endif + +#ifdef __NR_readlink +#define SYS_readlink __NR_readlink +#endif + +#ifdef __NR_readlinkat +#define SYS_readlinkat __NR_readlinkat +#endif + +#ifdef __NR_readv +#define SYS_readv __NR_readv +#endif + +#ifdef __NR_reboot +#define SYS_reboot __NR_reboot +#endif + +#ifdef __NR_recv +#define SYS_recv __NR_recv +#endif + +#ifdef __NR_recvfrom +#define SYS_recvfrom __NR_recvfrom +#endif + +#ifdef __NR_recvmmsg +#define SYS_recvmmsg __NR_recvmmsg +#endif + +#ifdef __NR_recvmsg +#define SYS_recvmsg __NR_recvmsg +#endif + +#ifdef __NR_remap_file_pages +#define SYS_remap_file_pages __NR_remap_file_pages +#endif + +#ifdef __NR_removexattr +#define SYS_removexattr __NR_removexattr +#endif + +#ifdef __NR_rename +#define SYS_rename __NR_rename +#endif + +#ifdef __NR_renameat +#define SYS_renameat __NR_renameat +#endif + +#ifdef __NR_renameat2 +#define SYS_renameat2 __NR_renameat2 +#endif + +#ifdef __NR_request_key +#define SYS_request_key __NR_request_key +#endif + +#ifdef __NR_restart_syscall +#define SYS_restart_syscall __NR_restart_syscall +#endif + +#ifdef __NR_rmdir +#define SYS_rmdir __NR_rmdir +#endif + +#ifdef __NR_rseq +#define SYS_rseq __NR_rseq +#endif + +#ifdef __NR_rt_sigaction +#define 
SYS_rt_sigaction __NR_rt_sigaction +#endif + +#ifdef __NR_rt_sigpending +#define SYS_rt_sigpending __NR_rt_sigpending +#endif + +#ifdef __NR_rt_sigprocmask +#define SYS_rt_sigprocmask __NR_rt_sigprocmask +#endif + +#ifdef __NR_rt_sigqueueinfo +#define SYS_rt_sigqueueinfo __NR_rt_sigqueueinfo +#endif + +#ifdef __NR_rt_sigreturn +#define SYS_rt_sigreturn __NR_rt_sigreturn +#endif + +#ifdef __NR_rt_sigsuspend +#define SYS_rt_sigsuspend __NR_rt_sigsuspend +#endif + +#ifdef __NR_rt_sigtimedwait +#define SYS_rt_sigtimedwait __NR_rt_sigtimedwait +#endif + +#ifdef __NR_rt_tgsigqueueinfo +#define SYS_rt_tgsigqueueinfo __NR_rt_tgsigqueueinfo +#endif + +#ifdef __NR_rtas +#define SYS_rtas __NR_rtas +#endif + +#ifdef __NR_s390_guarded_storage +#define SYS_s390_guarded_storage __NR_s390_guarded_storage +#endif + +#ifdef __NR_s390_pci_mmio_read +#define SYS_s390_pci_mmio_read __NR_s390_pci_mmio_read +#endif + +#ifdef __NR_s390_pci_mmio_write +#define SYS_s390_pci_mmio_write __NR_s390_pci_mmio_write +#endif + +#ifdef __NR_s390_runtime_instr +#define SYS_s390_runtime_instr __NR_s390_runtime_instr +#endif + +#ifdef __NR_s390_sthyi +#define SYS_s390_sthyi __NR_s390_sthyi +#endif + +#ifdef __NR_sched_get_affinity +#define SYS_sched_get_affinity __NR_sched_get_affinity +#endif + +#ifdef __NR_sched_get_priority_max +#define SYS_sched_get_priority_max __NR_sched_get_priority_max +#endif + +#ifdef __NR_sched_get_priority_min +#define SYS_sched_get_priority_min __NR_sched_get_priority_min +#endif + +#ifdef __NR_sched_getaffinity +#define SYS_sched_getaffinity __NR_sched_getaffinity +#endif + +#ifdef __NR_sched_getattr +#define SYS_sched_getattr __NR_sched_getattr +#endif + +#ifdef __NR_sched_getparam +#define SYS_sched_getparam __NR_sched_getparam +#endif + +#ifdef __NR_sched_getscheduler +#define SYS_sched_getscheduler __NR_sched_getscheduler +#endif + +#ifdef __NR_sched_rr_get_interval +#define SYS_sched_rr_get_interval __NR_sched_rr_get_interval +#endif + +#ifdef __NR_sched_rr_get_interval_time64 +#define SYS_sched_rr_get_interval_time64 __NR_sched_rr_get_interval_time64 +#endif + +#ifdef __NR_sched_set_affinity +#define SYS_sched_set_affinity __NR_sched_set_affinity +#endif + +#ifdef __NR_sched_setaffinity +#define SYS_sched_setaffinity __NR_sched_setaffinity +#endif + +#ifdef __NR_sched_setattr +#define SYS_sched_setattr __NR_sched_setattr +#endif + +#ifdef __NR_sched_setparam +#define SYS_sched_setparam __NR_sched_setparam +#endif + +#ifdef __NR_sched_setscheduler +#define SYS_sched_setscheduler __NR_sched_setscheduler +#endif + +#ifdef __NR_sched_yield +#define SYS_sched_yield __NR_sched_yield +#endif + +#ifdef __NR_seccomp +#define SYS_seccomp __NR_seccomp +#endif + +#ifdef __NR_security +#define SYS_security __NR_security +#endif + +#ifdef __NR_select +#define SYS_select __NR_select +#endif + +#ifdef __NR_semctl +#define SYS_semctl __NR_semctl +#endif + +#ifdef __NR_semget +#define SYS_semget __NR_semget +#endif + +#ifdef __NR_semop +#define SYS_semop __NR_semop +#endif + +#ifdef __NR_semtimedop +#define SYS_semtimedop __NR_semtimedop +#endif + +#ifdef __NR_send +#define SYS_send __NR_send +#endif + +#ifdef __NR_sendfile +#define SYS_sendfile __NR_sendfile +#endif + +#ifdef __NR_sendfile64 +#define SYS_sendfile64 __NR_sendfile64 +#endif + +#ifdef __NR_sendmmsg +#define SYS_sendmmsg __NR_sendmmsg +#endif + +#ifdef __NR_sendmsg +#define SYS_sendmsg __NR_sendmsg +#endif + +#ifdef __NR_sendto +#define SYS_sendto __NR_sendto +#endif + +#ifdef __NR_set_mempolicy +#define SYS_set_mempolicy __NR_set_mempolicy 
+#endif + +#ifdef __NR_set_robust_list +#define SYS_set_robust_list __NR_set_robust_list +#endif + +#ifdef __NR_set_thread_area +#define SYS_set_thread_area __NR_set_thread_area +#endif + +#ifdef __NR_set_tid_address +#define SYS_set_tid_address __NR_set_tid_address +#endif + +#ifdef __NR_setdomainname +#define SYS_setdomainname __NR_setdomainname +#endif + +#ifdef __NR_setfsgid +#define SYS_setfsgid __NR_setfsgid +#endif + +#ifdef __NR_setfsgid32 +#define SYS_setfsgid32 __NR_setfsgid32 +#endif + +#ifdef __NR_setfsuid +#define SYS_setfsuid __NR_setfsuid +#endif + +#ifdef __NR_setfsuid32 +#define SYS_setfsuid32 __NR_setfsuid32 +#endif + +#ifdef __NR_setgid +#define SYS_setgid __NR_setgid +#endif + +#ifdef __NR_setgid32 +#define SYS_setgid32 __NR_setgid32 +#endif + +#ifdef __NR_setgroups +#define SYS_setgroups __NR_setgroups +#endif + +#ifdef __NR_setgroups32 +#define SYS_setgroups32 __NR_setgroups32 +#endif + +#ifdef __NR_sethae +#define SYS_sethae __NR_sethae +#endif + +#ifdef __NR_sethostname +#define SYS_sethostname __NR_sethostname +#endif + +#ifdef __NR_setitimer +#define SYS_setitimer __NR_setitimer +#endif + +#ifdef __NR_setns +#define SYS_setns __NR_setns +#endif + +#ifdef __NR_setpgid +#define SYS_setpgid __NR_setpgid +#endif + +#ifdef __NR_setpgrp +#define SYS_setpgrp __NR_setpgrp +#endif + +#ifdef __NR_setpriority +#define SYS_setpriority __NR_setpriority +#endif + +#ifdef __NR_setregid +#define SYS_setregid __NR_setregid +#endif + +#ifdef __NR_setregid32 +#define SYS_setregid32 __NR_setregid32 +#endif + +#ifdef __NR_setresgid +#define SYS_setresgid __NR_setresgid +#endif + +#ifdef __NR_setresgid32 +#define SYS_setresgid32 __NR_setresgid32 +#endif + +#ifdef __NR_setresuid +#define SYS_setresuid __NR_setresuid +#endif + +#ifdef __NR_setresuid32 +#define SYS_setresuid32 __NR_setresuid32 +#endif + +#ifdef __NR_setreuid +#define SYS_setreuid __NR_setreuid +#endif + +#ifdef __NR_setreuid32 +#define SYS_setreuid32 __NR_setreuid32 +#endif + +#ifdef __NR_setrlimit +#define SYS_setrlimit __NR_setrlimit +#endif + +#ifdef __NR_setsid +#define SYS_setsid __NR_setsid +#endif + +#ifdef __NR_setsockopt +#define SYS_setsockopt __NR_setsockopt +#endif + +#ifdef __NR_settimeofday +#define SYS_settimeofday __NR_settimeofday +#endif + +#ifdef __NR_setuid +#define SYS_setuid __NR_setuid +#endif + +#ifdef __NR_setuid32 +#define SYS_setuid32 __NR_setuid32 +#endif + +#ifdef __NR_setxattr +#define SYS_setxattr __NR_setxattr +#endif + +#ifdef __NR_sgetmask +#define SYS_sgetmask __NR_sgetmask +#endif + +#ifdef __NR_shmat +#define SYS_shmat __NR_shmat +#endif + +#ifdef __NR_shmctl +#define SYS_shmctl __NR_shmctl +#endif + +#ifdef __NR_shmdt +#define SYS_shmdt __NR_shmdt +#endif + +#ifdef __NR_shmget +#define SYS_shmget __NR_shmget +#endif + +#ifdef __NR_shutdown +#define SYS_shutdown __NR_shutdown +#endif + +#ifdef __NR_sigaction +#define SYS_sigaction __NR_sigaction +#endif + +#ifdef __NR_sigaltstack +#define SYS_sigaltstack __NR_sigaltstack +#endif + +#ifdef __NR_signal +#define SYS_signal __NR_signal +#endif + +#ifdef __NR_signalfd +#define SYS_signalfd __NR_signalfd +#endif + +#ifdef __NR_signalfd4 +#define SYS_signalfd4 __NR_signalfd4 +#endif + +#ifdef __NR_sigpending +#define SYS_sigpending __NR_sigpending +#endif + +#ifdef __NR_sigprocmask +#define SYS_sigprocmask __NR_sigprocmask +#endif + +#ifdef __NR_sigreturn +#define SYS_sigreturn __NR_sigreturn +#endif + +#ifdef __NR_sigsuspend +#define SYS_sigsuspend __NR_sigsuspend +#endif + +#ifdef __NR_socket +#define SYS_socket __NR_socket +#endif + 
+#ifdef __NR_socketcall +#define SYS_socketcall __NR_socketcall +#endif + +#ifdef __NR_socketpair +#define SYS_socketpair __NR_socketpair +#endif + +#ifdef __NR_splice +#define SYS_splice __NR_splice +#endif + +#ifdef __NR_spu_create +#define SYS_spu_create __NR_spu_create +#endif + +#ifdef __NR_spu_run +#define SYS_spu_run __NR_spu_run +#endif + +#ifdef __NR_ssetmask +#define SYS_ssetmask __NR_ssetmask +#endif + +#ifdef __NR_stat +#define SYS_stat __NR_stat +#endif + +#ifdef __NR_stat64 +#define SYS_stat64 __NR_stat64 +#endif + +#ifdef __NR_statfs +#define SYS_statfs __NR_statfs +#endif + +#ifdef __NR_statfs64 +#define SYS_statfs64 __NR_statfs64 +#endif + +#ifdef __NR_statx +#define SYS_statx __NR_statx +#endif + +#ifdef __NR_stime +#define SYS_stime __NR_stime +#endif + +#ifdef __NR_stty +#define SYS_stty __NR_stty +#endif + +#ifdef __NR_subpage_prot +#define SYS_subpage_prot __NR_subpage_prot +#endif + +#ifdef __NR_swapcontext +#define SYS_swapcontext __NR_swapcontext +#endif + +#ifdef __NR_swapoff +#define SYS_swapoff __NR_swapoff +#endif + +#ifdef __NR_swapon +#define SYS_swapon __NR_swapon +#endif + +#ifdef __NR_switch_endian +#define SYS_switch_endian __NR_switch_endian +#endif + +#ifdef __NR_symlink +#define SYS_symlink __NR_symlink +#endif + +#ifdef __NR_symlinkat +#define SYS_symlinkat __NR_symlinkat +#endif + +#ifdef __NR_sync +#define SYS_sync __NR_sync +#endif + +#ifdef __NR_sync_file_range +#define SYS_sync_file_range __NR_sync_file_range +#endif + +#ifdef __NR_sync_file_range2 +#define SYS_sync_file_range2 __NR_sync_file_range2 +#endif + +#ifdef __NR_syncfs +#define SYS_syncfs __NR_syncfs +#endif + +#ifdef __NR_sys_debug_setcontext +#define SYS_sys_debug_setcontext __NR_sys_debug_setcontext +#endif + +#ifdef __NR_sys_epoll_create +#define SYS_sys_epoll_create __NR_sys_epoll_create +#endif + +#ifdef __NR_sys_epoll_ctl +#define SYS_sys_epoll_ctl __NR_sys_epoll_ctl +#endif + +#ifdef __NR_sys_epoll_wait +#define SYS_sys_epoll_wait __NR_sys_epoll_wait +#endif + +#ifdef __NR_syscall +#define SYS_syscall __NR_syscall +#endif + +#ifdef __NR_sysfs +#define SYS_sysfs __NR_sysfs +#endif + +#ifdef __NR_sysinfo +#define SYS_sysinfo __NR_sysinfo +#endif + +#ifdef __NR_syslog +#define SYS_syslog __NR_syslog +#endif + +#ifdef __NR_sysmips +#define SYS_sysmips __NR_sysmips +#endif + +#ifdef __NR_tee +#define SYS_tee __NR_tee +#endif + +#ifdef __NR_tgkill +#define SYS_tgkill __NR_tgkill +#endif + +#ifdef __NR_time +#define SYS_time __NR_time +#endif + +#ifdef __NR_timer_create +#define SYS_timer_create __NR_timer_create +#endif + +#ifdef __NR_timer_delete +#define SYS_timer_delete __NR_timer_delete +#endif + +#ifdef __NR_timer_getoverrun +#define SYS_timer_getoverrun __NR_timer_getoverrun +#endif + +#ifdef __NR_timer_gettime +#define SYS_timer_gettime __NR_timer_gettime +#endif + +#ifdef __NR_timer_settime +#define SYS_timer_settime __NR_timer_settime +#endif + +#ifdef __NR_timerfd +#define SYS_timerfd __NR_timerfd +#endif + +#ifdef __NR_timerfd_create +#define SYS_timerfd_create __NR_timerfd_create +#endif + +#ifdef __NR_timerfd_gettime +#define SYS_timerfd_gettime __NR_timerfd_gettime +#endif + +#ifdef __NR_timerfd_settime +#define SYS_timerfd_settime __NR_timerfd_settime +#endif + +#ifdef __NR_times +#define SYS_times __NR_times +#endif + +#ifdef __NR_tkill +#define SYS_tkill __NR_tkill +#endif + +#ifdef __NR_truncate +#define SYS_truncate __NR_truncate +#endif + +#ifdef __NR_truncate64 +#define SYS_truncate64 __NR_truncate64 +#endif + +#ifdef __NR_tuxcall +#define SYS_tuxcall __NR_tuxcall 
+#endif + +#ifdef __NR_ugetrlimit +#define SYS_ugetrlimit __NR_ugetrlimit +#endif + +#ifdef __NR_ulimit +#define SYS_ulimit __NR_ulimit +#endif + +#ifdef __NR_umask +#define SYS_umask __NR_umask +#endif + +#ifdef __NR_umount +#define SYS_umount __NR_umount +#endif + +#ifdef __NR_umount2 +#define SYS_umount2 __NR_umount2 +#endif + +#ifdef __NR_uname +#define SYS_uname __NR_uname +#endif + +#ifdef __NR_unlink +#define SYS_unlink __NR_unlink +#endif + +#ifdef __NR_unlinkat +#define SYS_unlinkat __NR_unlinkat +#endif + +#ifdef __NR_unshare +#define SYS_unshare __NR_unshare +#endif + +#ifdef __NR_uselib +#define SYS_uselib __NR_uselib +#endif + +#ifdef __NR_userfaultfd +#define SYS_userfaultfd __NR_userfaultfd +#endif + +#ifdef __NR_ustat +#define SYS_ustat __NR_ustat +#endif + +#ifdef __NR_utime +#define SYS_utime __NR_utime +#endif + +#ifdef __NR_utimensat +#define SYS_utimensat __NR_utimensat +#endif + +#ifdef __NR_utimes +#define SYS_utimes __NR_utimes +#endif + +#ifdef __NR_utrap_install +#define SYS_utrap_install __NR_utrap_install +#endif + +#ifdef __NR_vfork +#define SYS_vfork __NR_vfork +#endif + +#ifdef __NR_vhangup +#define SYS_vhangup __NR_vhangup +#endif + +#ifdef __NR_vm86 +#define SYS_vm86 __NR_vm86 +#endif + +#ifdef __NR_vm86old +#define SYS_vm86old __NR_vm86old +#endif + +#ifdef __NR_vmsplice +#define SYS_vmsplice __NR_vmsplice +#endif + +#ifdef __NR_vserver +#define SYS_vserver __NR_vserver +#endif + +#ifdef __NR_wait4 +#define SYS_wait4 __NR_wait4 +#endif + +#ifdef __NR_waitid +#define SYS_waitid __NR_waitid +#endif + +#ifdef __NR_waitpid +#define SYS_waitpid __NR_waitpid +#endif + +#ifdef __NR_write +#define SYS_write __NR_write +#endif + +#ifdef __NR_writev +#define SYS_writev __NR_writev +#endif + #endif // LLVM_LIBC_SYS_SYSCALL_H diff --git a/libc/newhdrgen/class_implementation/classes/enumeration.py b/libc/newhdrgen/class_implementation/classes/enumeration.py new file mode 100644 index 00000000000000..be03dbf603c2ba --- /dev/null +++ b/libc/newhdrgen/class_implementation/classes/enumeration.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# +# ====-- Enumeration class for libc function headers ----------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class Enumeration: + def __init__(self, name, value=None): + self.name = name + self.value = value + + def __str__(self): + if self.value != None: + return f"{self.name} = {self.value}" + else: + return f"{self.name}" diff --git a/libc/newhdrgen/class_implementation/classes/function.py b/libc/newhdrgen/class_implementation/classes/function.py new file mode 100644 index 00000000000000..3c464e48b6e3bb --- /dev/null +++ b/libc/newhdrgen/class_implementation/classes/function.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# +# ====-- Function class for libc function headers -------------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class Function: + def __init__( + self, standards, return_type, name, arguments, guard=None, attributes=[] + ): + self.standards = standards + self.return_type = return_type + self.name = name + self.arguments = [ + arg if isinstance(arg, str) else arg["type"] for arg in arguments + ] + self.guard = guard + self.attributes = attributes or [] + + def __str__(self): + attributes_str = " ".join(self.attributes) + arguments_str = ", ".join(self.arguments) + result = f"{self.return_type} {self.name}({arguments_str}){attributes_str};" + if self.guard: + result = f"#ifdef {self.guard}\n{result}\n#endif // {self.guard}" + return result diff --git a/libc/newhdrgen/class_implementation/classes/include.py b/libc/newhdrgen/class_implementation/classes/include.py new file mode 100644 index 00000000000000..2657f2e7c931cb --- /dev/null +++ b/libc/newhdrgen/class_implementation/classes/include.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# +# ====-- Include class for libc function headers --------------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class Include: + def __init__(self, name): + self.name = name + + def __str__(self): + return f'#include "{self.name}"' diff --git a/libc/newhdrgen/class_implementation/classes/macro.py b/libc/newhdrgen/class_implementation/classes/macro.py new file mode 100644 index 00000000000000..bf17ae6b6c5ab8 --- /dev/null +++ b/libc/newhdrgen/class_implementation/classes/macro.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# +# ====-- Macro class for libc function headers ----------------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class Macro: + def __init__(self, name, value=None): + self.name = name + self.value = value + + def __str__(self): + if self.value != None: + return f"#define {self.name} {self.value}" + else: + return f"#define {self.name}" diff --git a/libc/newhdrgen/class_implementation/classes/object.py b/libc/newhdrgen/class_implementation/classes/object.py new file mode 100644 index 00000000000000..c65a82e1a660d1 --- /dev/null +++ b/libc/newhdrgen/class_implementation/classes/object.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# +# ====-- Object class for libc function headers ---------------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class Object: + def __init__(self, name, type): + self.name = name + self.type = type + + def __str__(self): + return f"extern {self.type} {self.name}" diff --git a/libc/newhdrgen/class_implementation/classes/type.py b/libc/newhdrgen/class_implementation/classes/type.py new file mode 100644 index 00000000000000..8e4a8f3cc9c58f --- /dev/null +++ b/libc/newhdrgen/class_implementation/classes/type.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# +# ====-- Type class for libc function headers -----------------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class Type: + def __init__(self, type_name): + self.type_name = type_name + + def __str__(self): + return f"#include " diff --git a/libc/newhdrgen/header.py b/libc/newhdrgen/header.py new file mode 100644 index 00000000000000..7ce356831677eb --- /dev/null +++ b/libc/newhdrgen/header.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# +# ====- HeaderFile Class for libc function headers -----------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class HeaderFile: + def __init__(self, name): + self.name = name + self.macros = [] + self.types = [] + self.enumerations = [] + self.objects = [] + self.functions = [] + self.includes = [] + + def add_macro(self, macro): + self.macros.append(macro) + + def add_type(self, type_): + self.types.append(type_) + + def add_enumeration(self, enumeration): + self.enumerations.append(enumeration) + + def add_object(self, object): + self.objects.append(object) + + def add_function(self, function): + self.functions.append(function) + + def add_include(self, include): + self.includes.append(include) + + def __str__(self): + content = [""] + + for include in self.includes: + content.append(str(include)) + + for macro in self.macros: + content.append(str(macro)) + + for object in self.objects: + content.append(str(object)) + + for type_ in self.types: + content.append(str(type_)) + + if self.enumerations: + content.append("enum {") + for enum in self.enumerations: + content.append(f"\t{str(enum)},") + content.append("};") + + # TODO: replace line below with common.h functionality + content.append("__BEGIN_C_DECLS\n") + for function in self.functions: + content.append(str(function)) + content.append("") + content.append("__END_C_DECLS\n") + return "\n".join(content) diff --git a/libc/newhdrgen/yaml/arpa_inet.yaml b/libc/newhdrgen/yaml/arpa_inet.yaml new file mode 100644 index 00000000000000..945a602705dba4 --- /dev/null +++ b/libc/newhdrgen/yaml/arpa_inet.yaml @@ -0,0 +1,33 @@ +header: arpa-inet.h +macros: [] +types: + - type_name: uint32_t + - type_name: uint16_t + - type_name: inttypes.h +enums: [] +objects: [] +functions: + - name: htonl + standards: + - POSIX + return_type: uint32_t + arguments: + - type: uint32_t + - name: htons + standards: + - POSIX + return_type: uint16_t + arguments: + - type: uint16_t + - name: ntohl + standards: + - POSIX + 
return_type: uint32_t + arguments: + - type: uint32_t + - name: ntohs + standards: + - POSIX + return_type: uint16_t + arguments: + - type: uint16_t diff --git a/libc/newhdrgen/yaml/dirent.yaml b/libc/newhdrgen/yaml/dirent.yaml new file mode 100644 index 00000000000000..84a539a966345c --- /dev/null +++ b/libc/newhdrgen/yaml/dirent.yaml @@ -0,0 +1,46 @@ +header: dirent.h +macros: [] +types: + - type_name: struct_dirent + - type_name: DIR + - type_name: ino_t +enums: [] +objects: [] +functions: + - name: alphasort + standards: + - POSIX + return_type: int + arguments: + - type: const struct dirent ** + - type: const struct dirent ** + - name: closedir + standards: + - POSIX + return_type: int + arguments: + - type: DIR * + - name: dirfd + standards: + - POSIX + return_type: int + arguments: + - type: DIR * + - name: fdopendir + standards: + - POSIX + return_type: DIR * + arguments: + - type: int + - name: opendir + standards: + - POSIX + return_type: DIR * + arguments: + - type: const char * + - name: readdir + standards: + - POSIX + return_type: struct dirent * + arguments: + - type: DIR * diff --git a/libc/newhdrgen/yaml/errno.yaml b/libc/newhdrgen/yaml/errno.yaml new file mode 100644 index 00000000000000..5d8de90f035bd3 --- /dev/null +++ b/libc/newhdrgen/yaml/errno.yaml @@ -0,0 +1,10 @@ +header: errno.h +standards: + - stdc + - Linux + - POSIX +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/fcntl.yaml b/libc/newhdrgen/yaml/fcntl.yaml new file mode 100644 index 00000000000000..3cb9741256a3d2 --- /dev/null +++ b/libc/newhdrgen/yaml/fcntl.yaml @@ -0,0 +1,40 @@ +header: fcntl.h +macros: [] +types: + - type_name: off_t + - type_name: mode_t +enums: [] +objects: [] +functions: + - name: creat + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: mode_t + - name: fcntl + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - type: ... + - name: open + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: int + - type: ... + - name: openat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const char * + - type: int + - type: ... 
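
To make the new YAML-driven header flow concrete, here is a minimal usage sketch (not part of the patch) showing how a function entry such as `open` from fcntl.yaml above would be rendered by the Function class added earlier in libc/newhdrgen/class_implementation/classes/function.py. The import path is an assumption about how the generator scripts are laid out on sys.path; the construction and the printed declaration follow directly from the class as written in the diff.

# Hypothetical driver snippet; assumes libc/newhdrgen is the working directory / on sys.path.
from class_implementation.classes.function import Function

# Mirrors the fcntl.yaml entry: arguments are {"type": ...} dicts, exactly as the
# YAML "arguments" list is shaped; Function.__init__ extracts arg["type"] from each.
open_fn = Function(
    standards=["POSIX"],
    return_type="int",
    name="open",
    arguments=[{"type": "const char *"}, {"type": "int"}, {"type": "..."}],
)

print(open_fn)  # -> int open(const char *, int, ...);

The guard parameter is left as None here, so no #ifdef wrapper is emitted; a YAML entry that carried a guard would be wrapped in #ifdef/#endif by Function.__str__.
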
diff --git a/libc/newhdrgen/yaml/float.yaml b/libc/newhdrgen/yaml/float.yaml new file mode 100644 index 00000000000000..fcbede018113cd --- /dev/null +++ b/libc/newhdrgen/yaml/float.yaml @@ -0,0 +1,8 @@ +header: float.h +standards: + - stdc +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/inttypes.yaml b/libc/newhdrgen/yaml/inttypes.yaml new file mode 100644 index 00000000000000..772a1a0e06b214 --- /dev/null +++ b/libc/newhdrgen/yaml/inttypes.yaml @@ -0,0 +1,36 @@ +header: inttypes.h +macros: +types: + - type_name: imaxdiv_t +enums: [] +objects: [] +functions: + - name: imaxabs + standards: + - stdc + return_type: intmax_t + arguments: + - type: intmax_t + - name: imaxdiv + standards: + - stdc + return_type: imaxdiv_t + arguments: + - type: intmax_t + - type: intmax_t + - name: strtoimax + standards: + - stdc + return_type: intmax_t + arguments: + - type: const char *__restrict + - type: char * *__restrict + - type: int + - name: strtoumax + standards: + - stdc + return_type: uintmax_t + arguments: + - type: const char *__restrict + - type: char * *__restrict + - type: int diff --git a/libc/newhdrgen/yaml/limits.yaml b/libc/newhdrgen/yaml/limits.yaml new file mode 100644 index 00000000000000..9d0f14fa3095d1 --- /dev/null +++ b/libc/newhdrgen/yaml/limits.yaml @@ -0,0 +1,8 @@ +header: limits.h +standards: + - stdc +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/rpc.yaml b/libc/newhdrgen/yaml/rpc.yaml new file mode 100644 index 00000000000000..8a096db8784ec1 --- /dev/null +++ b/libc/newhdrgen/yaml/rpc.yaml @@ -0,0 +1,23 @@ +header: gpu-rpc.h +macros: [] +types: [] +enums: [] +objects: [] +functions: + - name: rpc_host_call + standards: + - GPUExtensions + return_type: void + arguments: + - type: void * + - type: void * + - type: size_t + - name: rpc_fprintf + standards: + - GPUExtensions + return_type: int + arguments: + - type: ::FILE *__restrict + - type: const char *__restrict + - type: void * + - type: size_t diff --git a/libc/newhdrgen/yaml/search.yaml b/libc/newhdrgen/yaml/search.yaml new file mode 100644 index 00000000000000..a7983a70bda730 --- /dev/null +++ b/libc/newhdrgen/yaml/search.yaml @@ -0,0 +1,60 @@ +header: search.h +macros: [] +types: + - type_name: size_t + - type_name: struct_hsearch_data + - type_name: ENTRY + - type_name: ACTION +enums: [] +objects: [] +functions: + - name: hcreate + standards: + - POSIX + return_type: int + arguments: + - type: size_t + - name: hcreate_r + standards: GNUExtensions + return_type: int + arguments: + - type: size_t + - type: struct hsearch_data * + - name: hsearch + standards: + - POSIX + return_type: ENTRY * + arguments: + - type: ENTRY + - type: ACTION + - name: hsearch_r + standards: GNUExtensions + return_type: int + arguments: + - type: ENTRY + - type: ACTION + - type: ENTRY * * + - type: struct hsearch_data * + - name: hdestroy + standards: GNUExtensions + return_type: void + arguments: [] + - name: hdestroy_r + standards: + - POSIX + return_type: void + arguments: + - type: struct hsearch_data * + - name: insque + standards: + - POSIX + return_type: void + arguments: + - type: void * + - type: void * + - name: remque + standards: + - POSIX + return_type: void + arguments: + - type: void * diff --git a/libc/newhdrgen/yaml/setjmp.yaml b/libc/newhdrgen/yaml/setjmp.yaml new file mode 100644 index 00000000000000..c750b46fc20881 --- /dev/null +++ b/libc/newhdrgen/yaml/setjmp.yaml @@ -0,0 +1,20 @@ +header: setjmp.h +macros: [] +types: + - type_name: 
jmp_buf +enums: [] +objects: [] +functions: + - name: longjmp + standards: + - stdc + return_type: _Noreturn void + arguments: + - type: jmp_buf + - type: int + - name: setjmp + standards: + - stdc + return_type: int + arguments: + - type: jmp_buf diff --git a/libc/newhdrgen/yaml/spawn.yaml b/libc/newhdrgen/yaml/spawn.yaml new file mode 100644 index 00000000000000..86a2e624d4ef48 --- /dev/null +++ b/libc/newhdrgen/yaml/spawn.yaml @@ -0,0 +1,58 @@ +header: spawn.h +macros: [] +types: + - type_name: posix_spawn_file_actions_t + - type_name: posix_spawnattr_t + - type_name: pid_t + - type_name: mode_t +enums: [] +objects: [] +functions: + - name: posix_spawn + standards: + - POSIX + return_type: int + arguments: + - type: pid_t *__restrict + - type: const char *__restrict + - type: posix_spawn_file_actions_t * + - type: posix_spawnattr_t *__restrict + - type: const char *__restrict * + - type: const char *__restrict * + - name: posix_spawn_file_actions_addclose + standards: + - POSIX + return_type: int + arguments: + - type: posix_spawn_file_actions_t * + - type: int + - name: posix_spawn_file_actions_adddup2 + standards: + - POSIX + return_type: int + arguments: + - type: posix_spawn_file_actions_t * + - type: int + - type: int + - name: posix_spawn_file_actions_addopen + standards: + - POSIX + return_type: int + arguments: + - type: posix_spawn_file_actions_t *__restrict + - type: int + - type: const char *__restrict + - type: int + - type: mode_t + - name: posix_spawn_file_actions_destroy + standards: + - POSIX + return_type: int + arguments: + - type: posix_spawn_file_actions_t * + - name: posix_spawn_file_actions_init + standards: + - POSIX + return_type: int + arguments: + - type: posix_spawn_file_actions_t * diff --git a/libc/newhdrgen/yaml/stdbit.yaml b/libc/newhdrgen/yaml/stdbit.yaml new file mode 100644 index 00000000000000..7e1368df9de511 --- /dev/null +++ b/libc/newhdrgen/yaml/stdbit.yaml @@ -0,0 +1,396 @@ +header: stdbit.h +macros: [] +types: [] +enums: [] +objects: [] +functions: + - name: stdc_leading_zeros_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_leading_zeros_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_leading_zeros_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_leading_zeros_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_leading_zeros_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_leading_ones_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_leading_ones_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_leading_ones_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_leading_ones_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_leading_ones_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_trailing_zeros_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_trailing_zeros_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_trailing_zeros_ui + standards: + - stdc + return_type: unsigned int + 
arguments: + - type: unsigned int + - name: stdc_trailing_zeros_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_trailing_zeros_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_trailing_ones_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_trailing_ones_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_trailing_ones_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_trailing_ones_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_trailing_ones_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_first_leading_zero_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_first_leading_zero_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_first_leading_zero_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_first_leading_zero_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_first_leading_zero_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_first_leading_one_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_first_leading_one_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_first_leading_one_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_first_leading_one_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_first_leading_one_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_first_trailing_one_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_first_trailing_one_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_first_trailing_one_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_first_trailing_one_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_first_trailing_one_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_count_zeros_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_count_zeros_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_count_zeros_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_count_zeros_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_count_zeros_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_count_ones_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_count_ones_us + standards: + - stdc + return_type: unsigned int + arguments: + 
- type: unsigned short + - name: stdc_count_ones_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_count_ones_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_count_ones_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_has_single_bit_uc + standards: + - stdc + return_type: bool + arguments: + - type: unsigned char + - name: stdc_has_single_bit_us + standards: + - stdc + return_type: bool + arguments: + - type: unsigned short + - name: stdc_has_single_bit_ui + standards: + - stdc + return_type: bool + arguments: + - type: unsigned int + - name: stdc_has_single_bit_ul + standards: + - stdc + return_type: bool + arguments: + - type: unsigned long + - name: stdc_has_single_bit_ull + standards: + - stdc + return_type: bool + arguments: + - type: unsigned long long + - name: stdc_bit_width_uc + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned char + - name: stdc_bit_width_us + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned short + - name: stdc_bit_width_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_bit_width_ul + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long + - name: stdc_bit_width_ull + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned long long + - name: stdc_bit_floor_uc + standards: + - stdc + return_type: unsigned char + arguments: + - type: unsigned char + - name: stdc_bit_floor_us + standards: + - stdc + return_type: unsigned short + arguments: + - type: unsigned short + - name: stdc_bit_floor_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_bit_floor_ul + standards: + - stdc + return_type: unsigned long + arguments: + - type: unsigned long + - name: stdc_bit_floor_ull + standards: + - stdc + return_type: unsigned long long + arguments: + - type: unsigned long long + - name: stdc_bit_ceil_uc + standards: + - stdc + return_type: unsigned char + arguments: + - type: unsigned char + - name: stdc_bit_ceil_us + standards: + - stdc + return_type: unsigned short + arguments: + - type: unsigned short + - name: stdc_bit_ceil_ui + standards: + - stdc + return_type: unsigned int + arguments: + - type: unsigned int + - name: stdc_bit_ceil_ul + standards: + - stdc + return_type: unsigned long + arguments: + - type: unsigned long + - name: stdc_bit_ceil_ull + standards: + - stdc + return_type: unsigned long long + arguments: + - type: unsigned long long diff --git a/libc/newhdrgen/yaml/stdckdint.yaml b/libc/newhdrgen/yaml/stdckdint.yaml new file mode 100644 index 00000000000000..5cc2e109e90e75 --- /dev/null +++ b/libc/newhdrgen/yaml/stdckdint.yaml @@ -0,0 +1,8 @@ +header: stdckdint.h +standards: + - stdc +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/stdint.yaml b/libc/newhdrgen/yaml/stdint.yaml new file mode 100644 index 00000000000000..5f613e1daa67fd --- /dev/null +++ b/libc/newhdrgen/yaml/stdint.yaml @@ -0,0 +1,8 @@ +header: stdint.h +standards: + - stdc +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/sys_auxv.yaml b/libc/newhdrgen/yaml/sys_auxv.yaml new file mode 100644 index 00000000000000..beea1d8b5f09f2 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_auxv.yaml @@ -0,0 +1,8 @@ +header: sys-auxv.h 
+standards: + - GNUExtensions +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/sys_epoll.yaml b/libc/newhdrgen/yaml/sys_epoll.yaml new file mode 100644 index 00000000000000..18d8f59de15f11 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_epoll.yaml @@ -0,0 +1,60 @@ +header: sys-epoll.h +macros: [] +types: + - type_name: struct_epoll_event + - type_name: struct_epoll_data + - type_name: sigset_t + - type_name: struct_timespec +enums: [] +objects: [] +functions: + - name: epoll_create + standards: + - Linux + return_type: int + arguments: + - type: int + - name: epoll_create1 + standards: + - Linux + return_type: int + arguments: + - type: int + - name: epoll_ctl + standards: + - Linux + return_type: int + arguments: + - type: int + - type: int + - type: int + - type: struct epoll_event * + - name: epoll_wait + standards: + - Linux + return_type: int + arguments: + - type: int + - type: struct epoll_event * + - type: int + - type: int + - name: epoll_pwait + standards: + - Linux + return_type: int + arguments: + - type: int + - type: struct epoll_event * + - type: int + - type: int + - type: const sigset_t * + - name: epoll_pwait2 + standards: + - Linux + return_type: int + arguments: + - type: int + - type: struct epoll_event * + - type: int + - type: const struct timespec * + - type: const sigset_t * diff --git a/libc/newhdrgen/yaml/sys_ioctl.yaml b/libc/newhdrgen/yaml/sys_ioctl.yaml new file mode 100644 index 00000000000000..ffe73a84d51b96 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_ioctl.yaml @@ -0,0 +1,7 @@ +header: sys-ioctl.h +standards: POSIX +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/sys_prctl.yaml b/libc/newhdrgen/yaml/sys_prctl.yaml new file mode 100644 index 00000000000000..35289f6274e7a0 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_prctl.yaml @@ -0,0 +1,7 @@ +header: sys-prctl.h +standards: Linux +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/sys_random.yaml b/libc/newhdrgen/yaml/sys_random.yaml new file mode 100644 index 00000000000000..233fb2c7988cb9 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_random.yaml @@ -0,0 +1,16 @@ +header: sys-random.h +macros: [] +types: + - type_name: ssize_t + - type_name: size_t +enums: [] +objects: +functions: + - name: getrandom + standards: + - Linux + return_type: ssize_t + arguments: + - type: void * + - type: size_t + - type: unsigned int diff --git a/libc/newhdrgen/yaml/sys_resource.yaml b/libc/newhdrgen/yaml/sys_resource.yaml new file mode 100644 index 00000000000000..2cc801c823ed1f --- /dev/null +++ b/libc/newhdrgen/yaml/sys_resource.yaml @@ -0,0 +1,20 @@ +header: sys-resource.h +macros: [] +types: + - type_name: struct_rlimit + - type_name: rlim_t +enums: [] +objects: [] +functions: + - name: getrlimit + standards: + - POSIX + return_type: int + arguments: + - type: struct rlimit * + - name: setrlimit + standards: + - POSIX + return_type: int + arguments: + - type: const struct rlimit diff --git a/libc/newhdrgen/yaml/sys_select.yaml b/libc/newhdrgen/yaml/sys_select.yaml new file mode 100644 index 00000000000000..eb621f7d14750b --- /dev/null +++ b/libc/newhdrgen/yaml/sys_select.yaml @@ -0,0 +1,22 @@ +header: sys-select.h +macros: [] +types: + - type_name: struct_timeval + - type_name: struct_timespec + - type_name: suseconds_t + - type_name: sigset_t + - type_name: time_t + - type_name: fd_set +enums: [] +objects: [] +functions: + - name: select + standards: + - POSIX + return_type: int + arguments: + 
- type: int + - type: fd_set *__restrict + - type: fd_set *__restrict + - type: fd_set *__restrict + - type: struct timeval *__restrict diff --git a/libc/newhdrgen/yaml/sys_sendfile.yaml b/libc/newhdrgen/yaml/sys_sendfile.yaml new file mode 100644 index 00000000000000..8743bf01b99ba4 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_sendfile.yaml @@ -0,0 +1,18 @@ +header: sys-sendfile.h +macros: [] +types: + - type_name: ssize_t + - type_name: size_t + - type_name: off_t +enums: [] +objects: [] +functions: + - name: sendfile + standards: + - GNUExtensions + return_type: ssize_t + arguments: + - type: int + - type: int + - type: off_t * + - type: size_t diff --git a/libc/newhdrgen/yaml/sys_socket.yaml b/libc/newhdrgen/yaml/sys_socket.yaml new file mode 100644 index 00000000000000..f9984c638ceae6 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_socket.yaml @@ -0,0 +1,26 @@ +header: sys-socket.h +macros: [] +types: + - type_name: struct_sockaddr_un + - type_name: struct_sockaddr + - type_name: socklen_t + - type_name: sa_family_t +enums: [] +objects: [] +functions: + - name: socket + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - type: int + - name: bind + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const struct sockaddr * + - type: socklen_t diff --git a/libc/newhdrgen/yaml/sys_stat.yaml b/libc/newhdrgen/yaml/sys_stat.yaml new file mode 100644 index 00000000000000..6ae69b4f870a2e --- /dev/null +++ b/libc/newhdrgen/yaml/sys_stat.yaml @@ -0,0 +1,77 @@ +header: sys-stat.h +macros: [] +types: + - type_name: blkcnt_t + - type_name: blksize_t + - type_name: off_t + - type_name: struct_timeval + - type_name: gid_t + - type_name: struct_stat + - type_name: uid_t + - type_name: nlink_t + - type_name: dev_t + - type_name: struct_timespec + - type_name: ino_t + - type_name: mode_t +enums: [] +objects: [] +functions: + - name: chmod + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: mode_t + - name: fchmod + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: mode_t + - name: fchmodat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const char * + - type: mode_t + - type: int + - name: fstat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: struct stat * + - name: lstat + standards: + - POSIX + return_type: int + arguments: + - type: const char *__restrict + - type: struct stat *__restrict + - name: mkdir + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: mode_t + - name: mkdirat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const char * + - type: mode_t + - name: stat + standards: + - POSIX + return_type: int + arguments: + - type: const char *__restrict + - type: struct stat *__restrict diff --git a/libc/newhdrgen/yaml/sys_statvfs.yaml b/libc/newhdrgen/yaml/sys_statvfs.yaml new file mode 100644 index 00000000000000..3651901e9f2c75 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_statvfs.yaml @@ -0,0 +1,21 @@ +header: sys-statvfs.h +macros: [] +types: + - type_name: struct_statvfs +enums: [] +objects: [] +functions: + - name: statvfs + standards: + - POSIX + return_type: int + arguments: + - type: const char *__restrict + - type: struct statvfs *__restrict + - name: fstatvfs + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: struct statvfs * diff --git a/libc/newhdrgen/yaml/sys_time.yaml b/libc/newhdrgen/yaml/sys_time.yaml new file 
mode 100644 index 00000000000000..a901cdafd26a16 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_time.yaml @@ -0,0 +1,8 @@ +header: sys-time.h +standards: Linux +macros: [] +types: + - type_name: struct_timeval +enums: [] +functions: [] +objects: [] diff --git a/libc/newhdrgen/yaml/sys_types.yaml b/libc/newhdrgen/yaml/sys_types.yaml new file mode 100644 index 00000000000000..6b08254a7fab13 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_types.yaml @@ -0,0 +1,30 @@ +header: sys-types.h +standards: POSIX +macros: [] +types: + - type_name: uid_t + - type_name: time_t + - type_name: pthread_t + - type_name: pthread_rwlockattr_t + - type_name: pthread_mutex_t + - type_name: blkcnt_t + - type_name: blksize_t + - type_name: clockid_t + - type_name: ssize_t + - type_name: pthread_mutexattr_t + - type_name: ino_t + - type_name: pthread_once_t + - type_name: mode_t + - type_name: dev_t + - type_name: pthread_attr_t + - type_name: gid_t + - type_name: pid_t + - type_name: nlink_t + - type_name: suseconds_t + - type_name: off_t + - type_name: size_t + - type_name: pthread_key_t + - type_name: pthread_condattr_t +enums: [] +functions: [] +objects: [] diff --git a/libc/newhdrgen/yaml/sys_utsname.yaml b/libc/newhdrgen/yaml/sys_utsname.yaml new file mode 100644 index 00000000000000..c48f7e32707486 --- /dev/null +++ b/libc/newhdrgen/yaml/sys_utsname.yaml @@ -0,0 +1,13 @@ +header: sys-utsname.h +macros: [] +types: + - type_name: struct_utsname +enums: [] +objects: [] +functions: + - name: uname + standards: + - POSIX + return_type: int + arguments: + - type: struct utsname * diff --git a/libc/newhdrgen/yaml/termios.yaml b/libc/newhdrgen/yaml/termios.yaml new file mode 100644 index 00000000000000..17986f8c0fc63e --- /dev/null +++ b/libc/newhdrgen/yaml/termios.yaml @@ -0,0 +1,85 @@ +header: termios.h +macros: [] +types: + - type_name: tcflag_t + - type_name: struct_termios + - type_name: speed_t + - type_name: pid_t + - type_name: cc_t +enums: [] +objects: [] +functions: + - name: cfgetispeed + standards: + - POSIX + return_type: speed_t + arguments: + - type: const struct termios * + - name: cfgetospeed + standards: + - POSIX + return_type: speed_t + arguments: + - type: const struct termios * + - name: cfsetispeed + standards: + - POSIX + return_type: int + arguments: + - type: struct termios * + - type: speed_t + - name: cfsetospeed + standards: + - POSIX + return_type: int + arguments: + - type: struct termios * + - type: speed_t + - name: tcgetattr + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: struct termios * + - name: tcgetsid + standards: + - POSIX + return_type: pid_t + arguments: + - type: int + - name: tcdrain + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: tcflow + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - name: tcflush + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - name: tcsendbreak + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - name: tcsetattr + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - type: struct termios * diff --git a/libc/newhdrgen/yaml/threads.yaml b/libc/newhdrgen/yaml/threads.yaml new file mode 100644 index 00000000000000..ef472af8bfd328 --- /dev/null +++ b/libc/newhdrgen/yaml/threads.yaml @@ -0,0 +1,148 @@ +header: threads.h +macros: + - macro_name: ONCE_FLAG_INIT + macro_value: "{0}" +types: + - type_name: once_flag + - type_name: __call_once_func_t + - type_name: 
cnd_t + - type_name: mtx_t + - type_name: thrd_start_t + - type_name: thrd_t + - type_name: tss_t + - type_name: tss_dtor_t +enums: + - name: mtx_plain + value: null + - name: mtx_recursive + value: null + - name: mtx_timed + value: null + - name: thrd_timedout + value: null + - name: thrd_success + value: null + - name: thrd_busy + value: null + - name: thrd_error + value: null + - name: thrd_nomem + value: null +objects: [] +functions: + - name: call_once + standards: + - stdc + return_type: void + arguments: + - type: once_flag * + - type: __call_once_func_t + - name: cnd_broadcast + standards: + - stdc + return_type: int + arguments: + - type: cnd_t * + - name: cnd_destroy + standards: + - stdc + return_type: void + arguments: + - type: cnd_t * + - name: cnd_init + standards: + - stdc + return_type: int + arguments: + - type: cnd_t * + - name: cnd_signal + standards: + - stdc + return_type: int + arguments: + - type: cnd_t * + - name: cnd_wait + standards: + - stdc + return_type: int + arguments: + - type: cnd_t * + - type: mtx_t * + - name: mtx_init + standards: + - stdc + return_type: int + arguments: + - type: mtx_t * + - type: int + - name: mtx_destroy + standards: + - stdc + return_type: int + arguments: + - type: void + - name: mtx_lock + standards: + - stdc + return_type: int + arguments: + - type: mtx_t * + - name: mtx_unlock + standards: + - stdc + return_type: int + arguments: + - type: mtx_t * + - name: thrd_create + standards: + - stdc + return_type: int + arguments: + - type: thrd_t * + - type: thrd_start_t + - type: void * + - name: thrd_join + standards: + - stdc + return_type: int + arguments: + - type: thrd_t + - type: int * + - name: thrd_detach + standards: + - stdc + return_type: int + arguments: + - type: thrd_t + - name: thrd_current + standards: + - stdc + return_type: thrd_t + arguments: + - type: void + - name: thrd_equal + standards: + - stdc + return_type: int + arguments: + - type: thrd_t + - type: thrd_t + - name: thrd_exit + standards: + - stdc + return_type: void + arguments: + - type: int + - name: tss_create + standards: + - stdc + return_type: int + arguments: + - type: tss_t * + - type: tss_dtor_t + - name: tss_delete + standards: + - stdc + return_type: int + arguments: + - type: tss_t diff --git a/libc/newhdrgen/yaml/uchar.yaml b/libc/newhdrgen/yaml/uchar.yaml new file mode 100644 index 00000000000000..a73da17acdeac3 --- /dev/null +++ b/libc/newhdrgen/yaml/uchar.yaml @@ -0,0 +1,12 @@ +header: uchar.h +standards: + - stdc +macros: [] +types: + - type_name: char32_t + - type_name: char16_t + - type_name: char8_t + - type_name: mbstate_t +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/unistd.yaml b/libc/newhdrgen/yaml/unistd.yaml new file mode 100644 index 00000000000000..511746aa19e856 --- /dev/null +++ b/libc/newhdrgen/yaml/unistd.yaml @@ -0,0 +1,329 @@ +header: unistd.h +macros: [] +types: + - type_name: uid_t + - type_name: ssize_t + - type_name: size_t + - type_name: pid_t + - type_name: off_t + - type_name: __getoptargv_t + - type_name: __exec_envp_t + - type_name: __exec_argv_t +enums: [] +objects: + - object_name: environ + object_type: char ** + - object_name: optarg + object_type: char * + - object_name: optind + object_type: int + - object_name: opterr + object_type: int + - object_name: optopt + object_type: int +functions: + - name: access + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: int + - name: chdir + standards: + - POSIX + return_type: int + arguments: + - type: const 
char * + - name: close + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: dup + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: dup2 + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - name: dup3 + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: int + - type: int + - name: execve + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: __exec_argv_t + - type: __exec_envp_t + - name: fchdir + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: fsync + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: ftruncate + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: off_t + - name: getcwd + standards: + - POSIX + return_type: char * + arguments: + - type: char * + - type: size_t + - name: geteuid + standards: + - POSIX + return_type: uid_t + arguments: [] + - name: getpid + standards: + - POSIX + return_type: int + arguments: [] + - name: getppid + standards: + - POSIX + return_type: int + arguments: [] + - name: getuid + standards: + - POSIX + return_type: uid_t + arguments: [] + - name: isatty + standards: + - POSIX + return_type: int + arguments: + - type: int + guard: null + attributes: [] + - name: link + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: const char * + - name: linkat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const char * + - type: int + - type: const char * + - type: int + - name: lseek + standards: + - POSIX + return_type: off_t + arguments: + - type: int + - type: off_t + - type: int + - name: pipe + standards: + - POSIX + return_type: int + arguments: + - type: int * + - name: pread + standards: + - POSIX + return_type: ssize_t + arguments: + - type: int + - type: void * + - type: size_t + - type: off_t + - name: pwrite + standards: + - POSIX + return_type: ssize_t + arguments: + - type: int + - type: const void * + - type: size_t + - type: off_t + - name: read + standards: + - POSIX + return_type: ssize_t + arguments: + - type: int + - type: void * + - type: size_t + - name: readlink + standards: + - POSIX + return_type: ssize_t + arguments: + - type: const char * __restrict + - type: char * __restrict + - type: size_t + - name: readlinkat + standards: + - POSIX + return_type: ssize_t + arguments: + - type: const char * __restrict + - type: char * __restrict + - type: size_t + - name: rmdir + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - name: symlink + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: const char * + - name: symlinkat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const char * + - type: int + - type: const char * + - name: sysconf + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: truncate + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: off_t + - name: unlink + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - name: unlinkat + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const char * + - type: int + - name: write + standards: + - POSIX + return_type: ssize_t + arguments: + - type: int + - type: const void * + - type: size_t + guard: null + attributes: [] + - name: _exit + standards: + - POSIX + return_type: _Noreturn void + 
arguments: + - type: int + - name: execv + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: __exec_argv_t + - name: fork + standards: + - POSIX + return_type: pid_t + arguments: [] + - name: __llvm_libc_syscall + standards: + - POSIX + return_type: long + arguments: + - type: long + - type: long + - type: long + - type: long + - type: long + - type: long + - type: long + - name: getopt + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: __getoptargv_t + - type: const char * + - name: swab + standards: + - POSIX + return_type: void + arguments: + - type: const void * __restrict + - type: void * + - type: ssize_t + - name: enivron + standards: + - POSIX + return_type: extern char ** + arguments: [] + - name: optarg + standards: + - POSIX + return_type: extern char * + arguments: [] + - name: optind + standards: + - POSIX + return_type: extern int + arguments: [] + - name: optopt + standards: + - POSIX + return_type: extern int + arguments: [] + - name: opterr + standards: + - POSIX + return_type: extern int + arguments: [] diff --git a/libc/newhdrgen/yaml/wchar.yaml b/libc/newhdrgen/yaml/wchar.yaml new file mode 100644 index 00000000000000..663267fb69d73e --- /dev/null +++ b/libc/newhdrgen/yaml/wchar.yaml @@ -0,0 +1,15 @@ +header: wchar.h +macros: [] +types: + - type_name: size_t + - type_name: wint_t + - type_name: wchar_t +enums: [] +objects: [] +functions: + - name: wctob + standards: + - stdc + return_type: int + arguments: + - type: wint_t diff --git a/libc/newhdrgen/yaml_combined/ctype.yaml b/libc/newhdrgen/yaml_combined/ctype.yaml new file mode 100644 index 00000000000000..a4a3048ede872c --- /dev/null +++ b/libc/newhdrgen/yaml_combined/ctype.yaml @@ -0,0 +1,103 @@ +header: ctype.h +macros: [] +types: [] +enums: [] +objects: [] +functions: + - name: isalnum + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isalpha + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isblank + standards: + - stdc + return_type: int + arguments: + - type: int + - name: iscntrl + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isdigit + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isgraph + standards: + - stdc + return_type: int + arguments: + - type: int + - name: islower + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isprint + standards: + - stdc + return_type: int + arguments: + - type: int + - name: ispunct + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isspace + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isupper + standards: + - stdc + return_type: int + arguments: + - type: int + - name: isxdigit + standards: + - stdc + return_type: int + arguments: + - type: int + - name: tolower + standards: + - stdc + return_type: int + arguments: + - type: int + - name: toupper + standards: + - stdc + return_type: int + arguments: + - type: int + functions: + - name: isascii + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: toascii + standards: + - GNUExtensions + return_type: int + arguments: + - type: int diff --git a/libc/newhdrgen/yaml_combined/fenv.yaml b/libc/newhdrgen/yaml_combined/fenv.yaml new file mode 100644 index 00000000000000..e7ff8cdf4e3b0b --- /dev/null +++ b/libc/newhdrgen/yaml_combined/fenv.yaml @@ -0,0 +1,107 @@ +header: fenv.h +macros: [] +types: + - type_name: fenv_t + - type_name: fexcept_t 
+enums: [] +objects: [] +functions: + - name: feclearexcept + standards: + - stdc + return_type: int + arguments: + - type: int + - name: fetestexcept + standards: + - stdc + return_type: int + arguments: + - type: int + - name: fetestexceptflag + standards: + - stdc + return_type: int + arguments: + - type: const fexcept_t * + - type: int + - name: feraiseexcept + standards: + - stdc + return_type: int + arguments: + - type: int + - name: fesetround + standards: + - stdc + return_type: int + arguments: + - type: int + - name: fegetround + standards: + - stdc + return_type: int + arguments: [] + - name: fegetenv + standards: + - stdc + return_type: int + arguments: + - type: fenv_t * + - name: fesetenv + standards: + - stdc + return_type: int + arguments: + - type: const fenv_t * + - name: fegetexceptflag + standards: + - stdc + return_type: int + arguments: + - type: fexcept_t * + - type: int + - name: fesetexcept + standards: + - stdc + return_type: int + arguments: + - type: int + - name: fesetexceptflag + standards: + - stdc + return_type: int + arguments: + - type: const fexcept_t * + - type: int + - name: feholdexcept + standards: + - stdc + return_type: int + arguments: + - type: fenv_t * + - name: feupdateenv + standards: + - stdc + return_type: int + arguments: + - type: const fenv_t * + - name: fedisableexcept + standards: + - GNUExtensions + return_type: int + arguments: + - type: int + guard: null + - name: feenableexcept + standards: + - GNUExtensions + return_type: int + arguments: + - type: int + attributes: [] + - name: fegetexcept + standards: + - GNUExtensions + return_type: int + arguments: [] diff --git a/libc/newhdrgen/yaml_combined/math.yaml b/libc/newhdrgen/yaml_combined/math.yaml new file mode 100644 index 00000000000000..dbb1e6ec63030b --- /dev/null +++ b/libc/newhdrgen/yaml_combined/math.yaml @@ -0,0 +1,1884 @@ +header: math.h +macros: [] +types: + - type_name: float_t + - type_name: double_t + - type_name: float128 +enums: [] +objects: [] +functions: + - name: copysign + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: copysignf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: copysignl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: copysignf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: copysignf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: ceil + standards: + - stdc + return_type: double + arguments: + - type: double + - name: ceilf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: ceill + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: ceilf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: ceilf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fabs + standards: + - stdc + return_type: double + arguments: + - type: double + attributes: + - __LIBC_CONST_ATTR + - name: fabsf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: fabsl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: fabsf16 + standards: + - stdc + 
return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fabsf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fdim + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fdimf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fdiml + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fdimf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fdimf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: floor + standards: + - stdc + return_type: double + arguments: + - type: double + - name: floorf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: floorl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: floorf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: floorf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fmin + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fminf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fminl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fminf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fminf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + - name: fmax + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fmaxf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fmaxl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fmaxf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fmaxf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + - name: fmaximum + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fmaximumf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fmaximuml + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fmaximumf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fmaximumf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fmaximum_num + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fmaximum_numf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fmaximum_numl + standards: + - stdc + return_type: long double + arguments: + - type: long 
double + - type: long double + - name: fmaximum_numf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fmaximum_numf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fmaximum_mag + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fmaximum_magf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fmaximum_magl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fmaximum_magf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fmaximum_magf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fmaximum_mag_num + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fmaximum_mag_numf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fmaximum_mag_numl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fmaximum_mag_numf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fmaximum_mag_numf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fminimum + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fminimumf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fminimuml + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fminimumf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fminimumf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fminimum_num + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fminimum_numf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fminimum_numl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fminimum_numf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fminimum_numf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fminimum_mag + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fminimum_magf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fminimum_magl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fminimum_magf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fminimum_magf128 + 
standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fminimum_mag_num + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fminimum_mag_numf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fminimum_mag_numl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fminimum_mag_numf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fminimum_mag_numf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fma + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - type: double + - name: fmaf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - type: float + - name: f16fmaf + standards: + - stdc + return_type: _Float16 + arguments: + - type: float + - type: float + - type: float + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fmod + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: fmodf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: fmodl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: fmodf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fmodf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: frexp + standards: + - stdc + return_type: double + arguments: + - type: double + - type: int * + - name: frexpf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: int * + - name: frexpl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: int * + - name: frexpf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: int * + guard: LIBC_TYPES_HAS_FLOAT16 + - name: frexpf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: int * + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fromfp + standards: + - stdc + return_type: double + arguments: + - type: double + - type: int + - type: unsigned int + - name: fromfpf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: int + - type: unsigned int + - name: fromfpl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: int + - type: unsigned int + - name: fromfpf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fromfpf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT128 + - name: fromfpx + standards: + - stdc + return_type: double + arguments: + - type: double + - type: int + - type: unsigned int + - name: fromfpxf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: int + - type: unsigned int + - name: fromfpxl + standards: + - stdc + return_type: long double + arguments: + - type: long double + 
- type: int + - type: unsigned int + - name: fromfpxf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT16 + - name: fromfpxf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT128 + - name: ufromfp + standards: + - stdc + return_type: double + arguments: + - type: double + - type: int + - type: unsigned int + - name: ufromfpf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: int + - type: unsigned int + - name: ufromfpl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: int + - type: unsigned int + - name: ufromfpf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT16 + - name: ufromfpf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT128 + - name: ufromfpx + standards: + - stdc + return_type: double + arguments: + - type: double + - type: int + - type: unsigned int + - name: ufromfpxf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: int + - type: unsigned int + - name: ufromfpxl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: int + - type: unsigned int + - name: ufromfpxf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT16 + - name: ufromfpxf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: int + - type: unsigned int + guard: LIBC_TYPES_HAS_FLOAT128 + - name: hypot + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: hypotf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: ilogb + standards: + - stdc + return_type: int + arguments: + - type: double + - name: ilogbf + standards: + - stdc + return_type: int + arguments: + - type: float + - name: ilogbl + standards: + - stdc + return_type: int + arguments: + - type: long double + - name: ilogbf16 + standards: + - stdc + return_type: int + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: ilogbf128 + standards: + - stdc + return_type: int + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: llogb + standards: + - stdc + return_type: long + arguments: + - type: double + - name: llogbf + standards: + - stdc + return_type: long + arguments: + - type: float + - name: llogbl + standards: + - stdc + return_type: long + arguments: + - type: long double + - name: llogbf16 + standards: + - stdc + return_type: long + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: llogbf128 + standards: + - stdc + return_type: long + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: ldexp + standards: + - stdc + return_type: double + arguments: + - type: double + - type: int + - name: ldexpf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: int + - name: ldexpl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: int + - name: ldexpf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: int + guard: 
LIBC_TYPES_HAS_FLOAT128 + - name: log10 + standards: + - stdc + return_type: double + arguments: + - type: double + - name: log10f + standards: + - stdc + return_type: float + arguments: + - type: float + - name: log1p + standards: + - stdc + return_type: double + arguments: + - type: double + - name: log1pf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: log2 + standards: + - stdc + return_type: double + arguments: + - type: double + - name: log2f + standards: + - stdc + return_type: float + arguments: + - type: float + - name: log + standards: + - stdc + return_type: double + arguments: + - type: double + - name: logf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: log1p + standards: + - stdc + return_type: double + arguments: + - type: double + - name: log1pf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: log2 + standards: + - stdc + return_type: double + arguments: + - type: double + - name: log2f + standards: + - stdc + return_type: float + arguments: + - type: float + - name: logb + standards: + - stdc + return_type: double + arguments: + - type: double + - name: logbf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: logbl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: logbf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: logbf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: modf + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double * + - name: modff + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float * + - name: modfl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double * + - name: modff16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 * + guard: LIBC_TYPES_HAS_FLOAT16 + - name: modff128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 * + guard: LIBC_TYPES_HAS_FLOAT128 + - name: cos + standards: + - stdc + return_type: double + arguments: + - type: double + - name: cosf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: sin + standards: + - stdc + return_type: double + arguments: + - type: double + - name: sincosf + standards: + - gnu + return_type: void + arguments: + - type: float + - type: float * + - type: float * + - name: sinf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: tan + standards: + - stdc + return_type: double + arguments: + - type: double + - name: tanf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: erff + standards: + - stdc + return_type: float + arguments: + - type: float + - name: exp + standards: + - stdc + return_type: double + arguments: + - type: double + - name: expf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: exp2 + standards: + - stdc + return_type: double + arguments: + - type: double + - name: exp2f + standards: + - stdc + return_type: float + arguments: + - type: float + - name: exp2m1f + standards: + - stdc + return_type: float + arguments: + - type: float + - name: expm1 + standards: + - stdc + return_type: double + arguments: + - type: double + - name: expm1f + 
standards: + - stdc + return_type: float + arguments: + - type: float + - name: exp10 + standards: + - stdc + return_type: double + arguments: + - type: double + - name: exp10f + standards: + - stdc + return_type: float + arguments: + - type: float + - name: remainder + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: remainderf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: remainderl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: remainderf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: remquo + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - type: int * + - name: remquof + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - type: int * + - name: remquol + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - type: int * + - name: remquof16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + - type: int * + guard: LIBC_TYPES_HAS_FLOAT16 + - name: remquof128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + - type: int * + guard: LIBC_TYPES_HAS_FLOAT128 + - name: round + standards: + - stdc + return_type: double + arguments: + - type: double + - name: roundf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: roundl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: roundf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: roundf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: roundeven + standards: + - stdc + return_type: double + arguments: + - type: double + - name: roundevenf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: roundevenl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: roundevenf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: roundevenf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: lround + standards: + - stdc + return_type: long + arguments: + - type: double + - name: lroundf + standards: + - stdc + return_type: long + arguments: + - type: float + - name: lroundl + standards: + - stdc + return_type: long + arguments: + - type: long double + - name: lroundf16 + standards: + - stdc + return_type: long + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: lroundf128 + standards: + - stdc + return_type: long + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: llround + standards: + - stdc + return_type: Longlong + arguments: + - type: double + - name: llroundf + standards: + - stdc + return_type: Longlong + arguments: + - type: float + - name: llroundl + standards: + - stdc + return_type: Longlong + arguments: + - type: long double + - name: llroundf16 + standards: + - stdc + return_type: Longlong + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: llroundf128 + standards: + - 
stdc + return_type: Longlong + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: rint + standards: + - stdc + return_type: double + arguments: + - type: double + - name: rintf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: rintl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: rintf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: rintf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: lrint + standards: + - stdc + return_type: long + arguments: + - type: double + - name: lrintf + standards: + - stdc + return_type: long + arguments: + - type: float + - name: lrintl + standards: + - stdc + return_type: long + arguments: + - type: long double + - name: lrintf16 + standards: + - stdc + return_type: long + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: lrintf128 + standards: + - stdc + return_type: long + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: llrint + standards: + - stdc + return_type: Longlong + arguments: + - type: double + - name: llrintf + standards: + - stdc + return_type: Longlong + arguments: + - type: float + - name: llrintl + standards: + - stdc + return_type: Longlong + arguments: + - type: long double + - name: llrintf16 + standards: + - stdc + return_type: Longlong + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: llrintf128 + standards: + - stdc + return_type: Longlong + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: sqrt + standards: + - stdc + return_type: double + arguments: + - type: double + - name: sqrtf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: sqrtl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: sqrtf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: trunc + standards: + - stdc + return_type: double + arguments: + - type: double + - name: truncf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: truncl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: truncf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: truncf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: nearbyint + standards: + - stdc + return_type: double + arguments: + - type: double + - name: nearbyintf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: nearbyintl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: nearbyintf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: nearbyintf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: nextafterf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: nextafter + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: nextafterl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + 
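The math entries above all follow the same shape: a name, the standards that require it, a return_type, an ordered arguments list, and an optional guard macro (LIBC_TYPES_HAS_FLOAT16 / LIBC_TYPES_HAS_FLOAT128) for the non-mandatory types. What the prototypes alone do not convey is the semantic split between the round/lround family (halfway cases go away from zero, independent of the FP environment) and the rint/lrint/nearbyint family (the current rounding mode decides). The small program below is only an illustration using standard <math.h>/<fenv.h>; it is not generated code and assumes nothing about this YAML beyond the standard C semantics of the listed functions.

```c
/* Standard C semantics of the rounding families specified above.
 * Build: cc round_demo.c -lm */
#include <fenv.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  /* lround(): halfway cases away from zero, regardless of rounding mode. */
  printf("lround(2.5)    = %ld\n", lround(2.5));   /* 3 */

  /* lrint()/nearbyint(): honor the current rounding mode. */
  fesetround(FE_DOWNWARD);
  printf("lrint(2.5)     = %ld\n", lrint(2.5));    /* 2 */
  printf("nearbyint(2.5) = %g\n", nearbyint(2.5)); /* 2, and no FE_INEXACT */

  fesetround(FE_TONEAREST);
  printf("lrint(2.5)     = %ld\n", lrint(2.5));    /* 2: ties-to-even */
  return 0;
}
```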
- name: nextafterf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: nextafterf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: nexttowardf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: long double + - name: nexttoward + standards: + - stdc + return_type: double + arguments: + - type: double + - type: long double + - name: nexttowardl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double + - name: nexttowardf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: nextdown + standards: + - stdc + return_type: double + arguments: + - type: double + - name: nextdownf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: nextdownl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: nextdownf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: nextdownf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: nextup + standards: + - stdc + return_type: double + arguments: + - type: double + - name: nextupf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: nextupl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - name: nextupf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: nextupf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: powf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: pow + standards: + - stdc + return_type: double + arguments: + - type: double + - type: double + - name: coshf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: sinhf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: tanhf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: acosf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: asinf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: asin + standards: + - stdc + return_type: double + arguments: + - type: double + - name: atanf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: atan2f + standards: + - stdc + return_type: float + arguments: + - type: float + - type: float + - name: acoshf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: asinhf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: atanhf + standards: + - stdc + return_type: float + arguments: + - type: float + - name: scalbn + standards: + - stdc + return_type: double + arguments: + - type: double + - type: int + - name: scalbnf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: int + - name: scalbnl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: int + - name: scalbnf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: int + 
guard: LIBC_TYPES_HAS_FLOAT128 + - name: nanf + standards: + - stdc + return_type: float + arguments: + - type: const char * + - name: nan + standards: + - stdc + return_type: double + arguments: + - type: const char * + - name: nanl + standards: + - stdc + return_type: long double + arguments: + - type: const char * + - name: nanf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: const char * + guard: LIBC_TYPES_HAS_FLOAT16 + - name: nanf128 + standards: + - stdc + return_type: float128 + arguments: + - type: const char * + guard: LIBC_TYPES_HAS_FLOAT128 + - name: canonicalize + standards: + - stdc + return_type: int + arguments: + - type: double + - type: double + - name: canonicalizef + standards: + - stdc + return_type: int + arguments: + - type: float + - type: float + - name: canonicalizel + standards: + - stdc + return_type: int + arguments: + - type: long double + - type: long double + - name: canonicalizef16 + standards: + - stdc + return_type: int + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 + - name: canonicalizef128 + standards: + - stdc + return_type: int + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 + - name: totalorderf16 + standards: + - stdc + return_type: int + arguments: + - type: _Float16 * + - type: _Float16 * + guard: LIBC_TYPES_HAS_FLOAT16 + - name: totalordermagf16 + standards: + - stdc + return_type: int + arguments: + - type: _Float16 * + - type: _Float16 * + guard: LIBC_TYPES_HAS_FLOAT16 + - name: f16sqrtf + standards: + - stdc + return_type: _Float16 + arguments: + - type: float + guard: LIBC_TYPES_HAS_FLOAT16 diff --git a/libc/newhdrgen/yaml_combined/pthread.yaml b/libc/newhdrgen/yaml_combined/pthread.yaml new file mode 100644 index 00000000000000..5a1ede32e53e3e --- /dev/null +++ b/libc/newhdrgen/yaml_combined/pthread.yaml @@ -0,0 +1,368 @@ +header: pthread.h +macros: [] +types: + - type_name: pthread_t + - type_name: pthread_once_t + - type_name: pthread_mutex_t + - type_name: pthread_mutexattr_t + - type_name: pthread_key_t + - type_name: pthread_condattr_t + - type_name: __pthread_tss_dtor_t + - type_name: pthread_rwlockattr_t + - type_name: pthread_attr_t + - type_name: __pthread_start_t + - type_name: __pthread_once_func_t + - type_name: __atfork_callback_t +enums: + - name: PTHREAD_CREATE_JOINABLE + value: 0x0 + - name: PTHREAD_CREATE_DETACHED + value: 0x1 + - name: PTHREAD_MUTEX_NORMAL + value: 0x0 + - name: PTHREAD_MUTEX_ERRORCHECK + value: 0x1 + - name: PTHREAD_MUTEX_RECURSIVE + value: 0x2 + - name: PTHREAD_MUTEX_DEFAULT + value: 0x0 + - name: PTHREAD_PROCESS_PRIVATE + value: 0x0 + - name: PTHREAD_PROCESS_SHARED + value: 0x1 + - name: PTHREAD_MUTEX_STALLED + value: 0x0 + - name: PTHREAD_MUTEX_ROBUST + value: 0x1 + - name: PTHREAD_RWLOCK_PREFER_READER_NP + value: 0 + - name: PTHREAD_RWLOCK_PREFER_WRITER_NP + value: 1 + - name: PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP + value: 2 +functions: + - name: pthread_atfork + standards: + - POSIX + return_type: int + arguments: + - type: __atfork_callback_t + - type: __atfork_callback_t + - type: __atfork_callback_t + - name: pthread_attr_destroy + standards: + - POSIX + return_type: int + arguments: + - type: pthread_attr_t * + - name: pthread_attr_init + standards: + - POSIX + return_type: int + arguments: + - type: pthread_attr_t * + - name: pthread_attr_getdetachstate + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_attr_t * + - type: int * + - name: pthread_attr_getguardsize + 
standards: + - POSIX + return_type: int + arguments: + - type: const pthread_attr_t * __restrict + - type: size_t * __restrict + - name: pthread_attr_getstack + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_attr_t * __restrict + - type: void * * __restrict + - type: size_t * __restrict + - name: pthread_attr_getstacksize + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_attr_t * __restrict + - type: size_t * __restrict + - name: pthread_attr_setdetachstate + standards: + - POSIX + return_type: int + arguments: + - type: pthread_attr_t * + - type: int + - name: pthread_attr_setguardsize + standards: + - POSIX + return_type: int + arguments: + - type: pthread_attr_t * + - type: size_t + - name: pthread_attr_setstack + standards: + - POSIX + return_type: int + arguments: + - type: pthread_attr_t * + - type: void * + - type: size_t + - name: pthread_attr_setstacksize + standards: + - POSIX + return_type: int + arguments: + - type: pthread_attr_t * + - type: size_t + - name: pthread_condattr_destroy + standards: + - POSIX + return_type: int + arguments: + - type: pthread_condattr_t * + - name: pthread_condattr_getclock + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_condattr_t * __restrict + - type: clockid_t * __restrict + - name: pthread_condattr_getpshared + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_condattr_t * __restrict + - type: int * __restrict + - name: pthread_condattr_init + standards: + - POSIX + return_type: int + arguments: + - type: pthread_condattr_t * + - name: pthread_condattr_setclock + standards: + - POSIX + return_type: int + arguments: + - type: pthread_condattr_t * + - type: clockid_t + - name: pthread_condattr_setpshared + standards: + - POSIX + return_type: int + arguments: + - type: pthread_condattr_t * + - type: int + - name: pthread_create + standards: + - POSIX + return_type: int + arguments: + - type: pthread_t * __restrict + - type: const pthread_attr_t * __restrict + - type: __pthread_start_t + - type: void * + - name: pthread_detach + standards: + - POSIX + return_type: int + arguments: + - type: pthread_t + - name: pthread_equal + standards: + - POSIX + return_type: int + arguments: + - type: pthread_t + - type: pthread_t + - name: pthread_exit + standards: + - POSIX + return_type: __Noreturn void + arguments: + - type: void * + - name: pthread_getname_np + standards: + - GNUExtensions + return_type: int + arguments: + - type: pthread_t + - type: char * + - type: size_t + - name: pthread_getspecific + standards: + - POSIX + return_type: void * + arguments: + - type: pthread_key_t + - name: pthread_join + standards: + - POSIX + return_type: int + arguments: + - type: pthread_t + - type: void * * + - name: pthread_key_create + standards: + - POSIX + return_type: int + arguments: + - type: pthread_key_t * + - type: __pthread_tss_dtor_t + - name: pthread_key_delete + standards: + - POSIX + return_type: int + arguments: + - type: pthread_key_t + - name: pthread_self + standards: + - POSIX + return_type: pthread_t + arguments: [] + - name: pthread_setname_np + standards: + - GNUExtensions + return_type: int + arguments: + - type: pthread_t + - type: const char * + - name: pthread_mutex_destroy + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutex_t * + - name: pthread_mutex_init + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutex_t * __restrict + - type: const pthread_mutexattr_t * __restrict 
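The pthread spec records the exact const/__restrict qualifiers POSIX uses, which is what the generator needs to reproduce the canonical prototypes. As a quick cross-check of the mutexattr and mutex entries (init above, lock/unlock/destroy just below), the following is a minimal standard-POSIX usage sketch; it is not tied to llvm-libc or to the generator in any way.

```c
/* Minimal POSIX usage matching the mutexattr/mutex prototypes in this spec.
 * Build: cc mutex_demo.c -lpthread */
#include <pthread.h>
#include <stdio.h>

int main(void) {
  pthread_mutexattr_t attr;
  pthread_mutex_t lock;

  pthread_mutexattr_init(&attr);
  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
  /* (pthread_mutex_t *__restrict, const pthread_mutexattr_t *__restrict) */
  pthread_mutex_init(&lock, &attr);

  pthread_mutex_lock(&lock);
  puts("holding the lock");
  pthread_mutex_unlock(&lock);

  pthread_mutexattr_destroy(&attr);
  pthread_mutex_destroy(&lock);
  return 0;
}
```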
+ - name: pthread_mutex_lock + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutex_t * + - name: pthread_mutex_unlock + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutex_t * + - name: pthread_mutexattr_destroy + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutexattr_t * + - name: pthread_mutexattr_init + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutexattr_t * + - name: pthread_mutexattr_getpshared + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_mutexattr_t * __restrict + - type: int * __restrict + - name: pthread_mutexattr_getrobust + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_mutexattr_t * __restrict + - type: int * __restrict + - name: pthread_mutexattr_gettype + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_mutexattr_t * __restrict + - type: int * __restrict + - name: pthread_mutexattr_setpshared + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutexattr_t * __restrict + - type: int + - name: pthread_mutexattr_setrobust + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutexattr_t * __restrict + - type: int + - name: pthread_mutexattr_settype + standards: + - POSIX + return_type: int + arguments: + - type: pthread_mutexattr_t * __restrict + - type: int + - name: pthread_once + standards: + - POSIX + return_type: int + arguments: + - type: pthread_once_t * + - type: __pthread_once_func_t + - name: pthread_rwlockattr_destroy + standards: + - POSIX + return_type: int + arguments: + - type: pthread_rwlockattr_t * + - name: pthread_rwlockattr_getkind_np + standards: + - POSIX + return_type: int + arguments: + - type: pthread_rwlockattr_t * + - type: int * + - name: pthread_rwlockattr_getpshared + standards: + - POSIX + return_type: int + arguments: + - type: const pthread_rwlockattr_t * + - type: int * + - name: pthread_rwlockattr_init + standards: + - POSIX + return_type: int + arguments: + - type: pthread_rwlockattr_t * + - name: pthread_rwlockattr_setkind_np + standards: + - POSIX + return_type: int + arguments: + - type: pthread_rwlockattr_t * + - type: int + - name: pthread_rwlockattr_setpshared + standards: + - POSIX + return_type: int + arguments: + - type: pthread_rwlockattr_t * + - type: int + - name: pthread_setspecific + standards: + - POSIX + return_type: void * + arguments: + - type: pthread_key_t + - type: const void * diff --git a/libc/newhdrgen/yaml_combined/sched.yaml b/libc/newhdrgen/yaml_combined/sched.yaml new file mode 100644 index 00000000000000..11a55b7d993b90 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/sched.yaml @@ -0,0 +1,87 @@ +header: sched.h +macros: [] +types: + - type_name: struct_timespec + - type_name: time_t + - type_name: struct_sched_param + - type_name: size_t + - type_name: cpu_set_t + - type_name: pid_t +enums: [] +objects: [] +functions: + - name: sched_get_priority_max + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: sched_get_priority_min + standards: + - POSIX + return_type: int + arguments: + - type: int + - name: sched_getaffinity + standards: + - GNUExtensions + return_type: int + arguments: + - type: pid_t + - type: size_t + - type: cpu_set_t * + - name: sched_getparam + standards: + - POSIX + return_type: int + arguments: + - type: pid_t + - type: struct sched_param * + - name: sched_getscheduler + standards: + - POSIX + return_type: int + arguments: + - type: 
pid_t + - type: int + - type: const struct sched_param * + - name: sched_rr_get_interval + standards: + - POSIX + return_type: int + arguments: + - type: pid_t + - type: struct timespec * + - name: sched_setaffinity + standards: + - GNUExtensions + return_type: int + arguments: + - type: pid_t + - type: size_t + - type: const cpu_set_t * + - name: sched_setparam + standards: + - POSIX + return_type: int + arguments: + - type: pid_t + - type: const struct sched_param * + - name: sched_setscheduler + standards: + - POSIX + return_type: int + arguments: + - type: pid_t + - name: sched_yield + standards: + - POSIX + return_type: int + arguments: [] + - name: __sched_getcpucount + standards: + - llvm_libc_ext + return_type: int + arguments: + - type: size_t + - type: const cpu_set_t * diff --git a/libc/newhdrgen/yaml_combined/signal.yaml b/libc/newhdrgen/yaml_combined/signal.yaml new file mode 100644 index 00000000000000..a95273419dbde4 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/signal.yaml @@ -0,0 +1,84 @@ +header: signal.h +macros: + - macro_name: __need_size_t + macro_value: null +types: + - type_name: pid_t + - type_name: stack_t + - type_name: siginfo_t + - type_name: struct_sigaction + - type_name: sigset_t + - type_name: union_sigval + - type_name: sig_atomic_t +enums: [] +objects: [] +functions: + - name: raise + standards: + - stdc + return_type: int + arguments: + - type: int + - name: kill + standards: + - POSIX + return_type: int + arguments: + - type: pid_t + - type: int + - name: sigaction + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const struct sigaction * __restrict + - type: struct sigaction * __restrict + - name: sigaltstack + standards: + - POSIX + return_type: int + arguments: + - type: const stack_t * __restrict + - type: stack_t * __restrict + - name: sigdelset + standards: + - POSIX + return_type: int + arguments: + - type: sigset_t * + - type: int + - name: sigaddset + standards: + - POSIX + return_type: int + arguments: + - type: sigset_t * + - type: int + - name: sigemptyset + standards: + - POSIX + return_type: int + arguments: + - type: sigset_t * + - name: sigprocmask + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: const sigset_t * __restrict + - type: sigset_t * __restrict + - name: sigfillset + standards: + - POSIX + return_type: int + arguments: + - type: sigset_t * + - name: signal + standards: + - stdc + return_type: __sighandler_t + arguments: + - type: int + - type: __sighandler_t diff --git a/libc/newhdrgen/yaml_combined/stdfix.yaml b/libc/newhdrgen/yaml_combined/stdfix.yaml new file mode 100644 index 00000000000000..564826a3335805 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/stdfix.yaml @@ -0,0 +1,222 @@ +header: stdfix.h +macros: [] +types: [] +enums: [] +objects: [] +functions: + - name: abshk + standards: + - stdc_ext + return_type: short accum + arguments: + - type: short accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: abshr + standards: + - stdc_ext + return_type: short fract + arguments: + - type: short fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: absk + standards: + - stdc_ext + return_type: accum + arguments: + - type: accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: absr + standards: + - stdc_ext + return_type: fract + arguments: + - type: fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: abslk + standards: + - stdc_ext + return_type: long accum + arguments: + - type: long accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: abslr + standards: 
+ - stdc_ext + return_type: long fract + arguments: + - type: long fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: exphk + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: short accum + arguments: + - type: short accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: expk + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: accum + arguments: + - type: accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundhk + standards: + - stdc_ext + return_type: short accum + arguments: + - type: short accum + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundhr + standards: + - stdc_ext + return_type: short fract + arguments: + - type: short fract + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundk + standards: + - stdc_ext + return_type: accum + arguments: + - type: accum + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundr + standards: + - stdc_ext + return_type: fract + arguments: + - type: fract + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundlk + standards: + - stdc_ext + return_type: long accum + arguments: + - type: long accum + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundlr + standards: + - stdc_ext + return_type: long fract + arguments: + - type: long fract + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: rounduhk + standards: + - stdc_ext + return_type: unsigned short accum + arguments: + - type: unsigned short accum + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: rounduhr + standards: + - stdc_ext + return_type: unsigned short fract + arguments: + - type: unsigned short fract + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: rounduk + standards: + - stdc_ext + return_type: unsigned accum + arguments: + - type: unsigned accum + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundur + standards: + - stdc_ext + return_type: unsigned fract + arguments: + - type: unsigned fract + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundulk + standards: + - stdc_ext + return_type: unsigned long accum + arguments: + - type: unsigned long accum + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: roundulr + standards: + - stdc_ext + return_type: unsigned long fract + arguments: + - type: unsigned long fract + - type: int + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: sqrtuhk + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: unsigned short accum + arguments: + - type: unsigned short accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: sqrtuhr + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: unsigned short fract + arguments: + - type: unsigned short fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: sqrtuk + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: unsigned accum + arguments: + - type: unsigned accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: sqrtur + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: unsigned fract + arguments: + - type: unsigned fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: sqrtulr + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: unsigned long fract + arguments: + - type: unsigned long fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: sqrtulk + standards: llvm_libc_stdfix_ext + return_type: unsigned long accum + arguments: + - type: unsigned long accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: uhksqrtus + standards: + - stdc_ext + - llvm_libc_stdfix_ext + 
return_type: unsigned short accum + arguments: + - type: unsigned short + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: uksqrtui + standards: + - stdc_ext + - llvm_libc_stdfix_ext + return_type: unsigned accum + arguments: + - type: unsigned int + guard: LIBC_COMPILER_HAS_FIXED_POINT diff --git a/libc/newhdrgen/yaml_combined/stdio.yaml b/libc/newhdrgen/yaml_combined/stdio.yaml new file mode 100644 index 00000000000000..928a8d5228c211 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/stdio.yaml @@ -0,0 +1,370 @@ +header: stdio.h +macros: + - macro_name: _IONBF + macro_value: 2 + - macro_name: _IOLBF + macro_value: 1 + - macro_name: _IOFBF + macro_value: 0 + - macro_name: stdout + macro_value: stdout + - macro_name: stdin + macro_value: stdin +types: + - type_name: size_t + - type_name: off_t + - type_name: cookie_io_functions_t + - type_name: FILE +enums: [] +objects: + - object_name: stdin + object_type: FILE * + - object_name: stdout + object_type: FILE * + - object_name: stderr + object_type: FILE * +functions: + - name: remove + standards: + - stdc + return_type: int + arguments: + - type: const char * + - name: rename + standards: + - stdc + return_type: int + arguments: + - type: const char * + - type: const char * + - name: sprintf + standards: + - stdc + return_type: int + arguments: + - type: char * __restrict + - type: const char * __restrict + - type: ... + - name: snprintf + standards: + - stdc + return_type: int + arguments: + - type: char * __restrict + - type: size_t + - type: const char * __restrict + - type: ... + - name: fprintf + standards: + - stdc + return_type: int + arguments: + - type: FILE * __restrict + - type: const char * __restrict + - type: ... + - name: printf + standards: + - stdc + return_type: int + arguments: + - type: const char * __restrict + - type: ... + - name: vsprintf + standards: + - stdc + return_type: int + arguments: + - type: char * __restrict + - type: const char * __restrict + - type: va_list + - name: vsnprintf + standards: + - stdc + return_type: int + arguments: + - type: char * __restrict + - type: size_t + - type: const char * __restrict + - type: va_list + - name: vfprintf + standards: + - stdc + return_type: int + arguments: + - type: FILE * __restrict + - type: const char * __restrict + - type: va_list + - name: vprintf + standards: + - stdc + return_type: int + arguments: + - type: const char * __restrict + - type: va_list + - name: sscanf + standards: + - stdc + return_type: int + arguments: + - type: const char * __restrict + - type: const char * __restrict + - type: ... + - name: scanf + standards: + - stdc + return_type: int + arguments: + - type: const char * __restrict + - type: ... + - name: fscanf + standards: + - stdc + return_type: int + arguments: + - type: FILE * __restrict + - type: const char * __restrict + - type: ... 
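The stdio spec mixes three kinds of entries: macros (_IOFBF and friends), objects (stdin/stdout/stderr as FILE *), and the function prototypes themselves, with the formatted-I/O group carrying variadic `...` arguments. The snippet below is a plain ISO C usage example of a few of the functions specified above; it assumes only the standard prototypes, nothing about the generated header.

```c
/* Plain ISO C usage of a few entries from the stdio spec above. */
#include <stdio.h>

int main(void) {
  char buf[32];
  int n = snprintf(buf, sizeof(buf), "%d + %d = %d", 2, 2, 4);
  fprintf(stderr, "snprintf wrote %d bytes: \"%s\"\n", n, buf);

  int x = 0;
  if (sscanf(buf, "%d", &x) == 1)
    printf("parsed the first integer back: %d\n", x);
  return 0;
}
```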
+ - name: fileno + standards: + - POSIX + return_type: int + arguments: + - type: FILE * + - name: clearerr + standards: + - stdc + return_type: void + arguments: + - type: FILE * + - name: clearerr_unlocked + standards: + - GNUExtensions + return_type: void + arguments: + - type: FILE * + - name: fclose + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - name: flockfile + standards: + - POSIX + return_type: void + arguments: + - type: FILE * + - name: feof + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - name: feof_unlocked + standards: + - GNUExtensions + return_type: int + arguments: + - type: FILE * + - name: ferror + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - name: ferror_unlocked + standards: + - GNUExtensions + return_type: int + arguments: + - type: FILE * + - name: fgetc + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - name: fgetc_unlocked + standards: + - GNUExtensions + return_type: int + arguments: + - type: FILE * + - name: fgets + standards: + - stdc + return_type: char * + arguments: + - type: char * __restrict + - type: int + - type: FILE * __restrict + - name: fflush + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - name: fopen + standards: + - stdc + return_type: FILE * + arguments: + - type: const char * + - type: const char * + - name: fputc + standards: + - stdc + return_type: int + arguments: + - type: int + - type: FILE * + - name: fputs + standards: + - stdc + return_type: int + arguments: + - type: const char * __restrict + - type: FILE * __restrict + - name: fopencookie + standards: + - GNUExtensions + return_type: FILE * + arguments: + - type: void * + - type: const char * + - type: cookie_io_functions_t + - name: fread + standards: + - stdc + return_type: size_t + arguments: + - type: void * __restrict + - type: size_t + - type: size_t + - type: FILE * __restrict + - name: fread_unlocked + standards: + - GNUExtensions + return_type: size_t + arguments: + - type: void * __restrict + - type: size_t + - type: size_t + - type: FILE * __restrict + - name: fseek + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - type: long + - type: int + - name: ftell + standards: + - stdc + return_type: long + arguments: + - type: FILE * + - name: funlockfile + standards: + - POSIX + return_type: void + arguments: + - type: FILE * + - name: fwrite + standards: + - stdc + return_type: size_t + arguments: + - type: const void * __restrict + - type: size_t + - type: size_t + - type: FILE * __restrict + - name: fwrite_unlocked + standards: + - GNUExtensions + return_type: size_t + arguments: + - type: const void * __restrict + - type: size_t + - type: size_t + - type: FILE * __restrict + - name: getc + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - name: getc_unlocked + standards: + - POSIX + return_type: int + arguments: + - type: FILE * + - name: getchar + standards: + - stdc + return_type: int + arguments: [] + - name: getchar_unlocked + standards: + - POSIX + return_type: int + arguments: [] + - name: putc + standards: + - stdc + return_type: int + arguments: + - type: int + - type: FILE * + - name: putchar + standards: + - stdc + return_type: int + arguments: + - type: int + - name: puts + standards: + - stdc + return_type: int + arguments: + - type: const char * + - name: setbuf + standards: + - stdc + return_type: void + arguments: + - type: FILE * __restrict + - type: char * __restrict + - name: 
setvbuf + standards: + - stdc + return_type: int + arguments: + - type: FILE * __restrict + - type: char * __restrict + - type: int + - type: size_t + - name: ungetc + standards: + - stdc + return_type: int + arguments: + - type: int + - type: FILE * + - name: stderr + standards: + - stdc + return_type: extern FILE * + arguments: [] + - name: stdin + standards: + - stdc + return_type: extern FILE * + arguments: [] + - name: stdout + standards: + - stdc + return_type: extern FILE * + arguments: [] diff --git a/libc/newhdrgen/yaml_combined/stdlib.yaml b/libc/newhdrgen/yaml_combined/stdlib.yaml new file mode 100644 index 00000000000000..032b47f8291e61 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/stdlib.yaml @@ -0,0 +1,273 @@ +header: stdlib.h +standards: + - stdc +macros: [] +types: + - type_name: __atexithandler_t + - type_name: __qsortrcompare_t + - type_name: __qsortcompare_t + - type_name: __bsearchcompare_t + - type_name: size_t + - type_name: lldiv_t + - type_name: ldiv_t + - type_name: div_t +enums: [] +objects: [] +functions: + - name: abs + standards: + - stdc + return_type: int + arguments: + - type: int + - name: atoi + standards: + - stdc + return_type: int + arguments: + - type: const char * + - name: atof + standards: + - stdc + return_type: double + arguments: + - type: const char * __restrict + - name: atol + standards: + - stdc + return_type: long + arguments: + - type: const char * + - name: atoll + standards: + - stdc + return_type: long long + arguments: + - type: const char * + - name: bsearch + standards: + - stdc + return_type: void * + arguments: + - type: const void * + - type: const void * + - type: size_t + - type: size_t + - type: __bsearchcompare_t + - name: div + standards: + - stdc + return_type: div_t + arguments: + - type: int + - type: int + - name: labs + standards: + - stdc + return_type: long + arguments: + - type: long + - name: ldiv + standards: + - stdc + return_type: ldiv_t + arguments: + - type: long + - type: long + - name: llabs + standards: + - stdc + return_type: long long + arguments: + - type: long long + - name: lldiv + standards: + - stdc + return_type: lldiv_t + arguments: + - type: long long + - type: long long + - name: qsort + standards: + - stdc + return_type: void + arguments: + - type: void * + - type: size_t + - type: size_t + - type: __qsortcompare_t + - name: qsort_r + standards: + - GNUExtensions + return_type: void + arguments: + - type: void * + - type: size_t + - type: size_t + - type: __qsortrcompare_t + - type: void * + - name: rand + standards: + - stdc + return_type: int + arguments: [] + - name: srand + standards: + - stdc + return_type: void + arguments: + - type: unsigned int + - name: strfromd + standards: + - stdc + return_type: int + arguments: + - type: char * __restrict + - type: size_t + - type: const char * __restrict + - type: double + - name: strfromf + standards: + - stdc + return_type: int + arguments: + - type: char * __restrict + - type: size_t + - type: const char * __restrict + - type: float + - name: strfroml + standards: + - stdc + return_type: int + arguments: + - type: char * __restrict + - type: size_t + - type: const char * __restrict + - type: long double + - name: strtod + standards: + - stdc + return_type: double + arguments: + - type: const char * __restrict + - type: char * * __restrict + - name: strtof + standards: + - stdc + return_type: float + arguments: + - type: const char * __restrict + - type: char * * __restrict + - name: strtol + standards: + - stdc + return_type: long + arguments: + 
- type: const char * __restrict + - type: char * * __restrict + - type: int + - name: strtold + standards: + - stdc + return_type: long double + arguments: + - type: const char * __restrict + - type: char * * __restrict + - name: strtoll + standards: + - stdc + return_type: long long + arguments: + - type: const char * __restrict + - type: char * * __restrict + - type: int + - name: strtoul + standards: + - stdc + return_type: unsigned long + arguments: + - type: const char * __restrict + - type: char * * __restrict + - type: int + - name: strtoull + standards: + - stdc + return_type: unsigned long long + arguments: + - type: const char * __restrict + - type: char * * __restrict + - type: int + - name: malloc + standards: + - stdc + return_type: void * + arguments: + - type: size_t + - name: calloc + standards: + - stdc + return_type: void * + arguments: + - type: size_t + - type: size_t + - name: realloc + standards: + - stdc + return_type: void * + arguments: + - type: void * + - type: size_t + - name: aligned_alloc + standards: + - stdc + return_type: void * + arguments: + - type: size_t + - type: size_t + - name: free + standards: + - stdc + return_type: void + arguments: + - type: void * + - name: _Exit + standards: + - stdc + return_type: _Noreturn void + arguments: + - type: int + - name: abort + standards: + - stdc + return_type: _Noreturn void + arguments: [] + - name: at_quick_exit + standards: + - stdc + return_type: int + arguments: + - type: __atexithandler_t + - name: atexit + standards: + - stdc + return_type: int + arguments: + - type: __atexithandler_t + - name: exit + standards: + - stdc + return_type: _Noreturn void + arguments: + - type: int + - name: getenv + standards: + - stdc + return_type: char * + arguments: + - type: const char * + - name: quick_exit + standards: + - stdc + return_type: _Noreturn void + arguments: + - type: int diff --git a/libc/newhdrgen/yaml_combined/string.yaml b/libc/newhdrgen/yaml_combined/string.yaml new file mode 100644 index 00000000000000..7080a9b7cb3a75 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/string.yaml @@ -0,0 +1,302 @@ +header: string.h +macros: [] +types: + - type_name: size_t +enums: [] +objects: [] +functions: + - name: memcpy + standards: + - stdc + return_type: void * + arguments: + - type: void *__restrict + - type: const void *__restrict + - type: size_t + - name: memmove + standards: + - stdc + return_type: void * + arguments: + - type: void * + - type: const void * + - type: size_t + - name: memcmp + standards: + - stdc + return_type: int + arguments: + - type: const void * + - type: const void * + - type: size_t + - name: memchr + standards: + - stdc + return_type: void * + arguments: + - type: const void * + - type: int + - type: size_t + - name: memset + standards: + - stdc + return_type: void * + arguments: + - type: void * + - type: int + - type: size_t + - name: memset_explicit + standards: + - stdc + return_type: void * + arguments: + - type: void * + - type: int + - type: size_t + - name: strcpy + standards: + - stdc + return_type: char + arguments: + - type: char *__restrict + - type: const char *__restrict + - name: strncpy + standards: + - stdc + return_type: char * + arguments: + - type: char *__restrict + - type: const char *__restrict + - type: size_t + - name: strcat + standards: + - stdc + return_type: char * + arguments: + - type: char *__restrict + - type: const char *__restrict + - name: strncat + standards: + - stdc + return_type: char * + arguments: + - type: char * + - type: const char * + - 
type: size_t + - name: strcmp + standards: + - stdc + return_type: int + arguments: + - type: const char * + - type: const char * + - name: strcoll + standards: + - stdc + return_type: int + arguments: + - type: const char * + - type: const char * + - name: strncmp + standards: + - stdc + return_type: int + arguments: + - type: const char * + - type: const char * + - type: size_t + - name: strxfrm + standards: + - stdc + return_type: size_t + arguments: + - type: char *__restrict + - type: const char *__restrict + - type: size_t + - name: strchr + standards: + - stdc + return_type: char * + arguments: + - type: const char * + - type: int + - name: strcspn + standards: + - stdc + return_type: size_t + arguments: + - type: const char * + - type: const char * + - name: strdup + return_type: char * + arguments: + - type: const char * + - name: strndup + standards: + - stdc + return_type: char * + arguments: + - type: const char * + - type: size_t + - name: strpbrk + standards: + - stdc + return_type: char * + arguments: + - type: const char * + - type: const char * + - name: strrchr + standards: + - stdc + return_type: char * + arguments: + - type: const char * + - type: int + - name: strspn + standards: + - stdc + return_type: size_t + arguments: + - type: const char * + - type: const char * + - name: strstr + standards: + - stdc + return_type: char * + arguments: + - type: const char * + - type: const char * + - name: strtok + standards: + - stdc + return_type: char * + arguments: + - type: char *__restrict + - type: const char *__restrict + - name: strerror + standards: + - stdc + return_type: char * + arguments: + - type: int + - name: strlen + standards: + - stdc + return_type: size_t + arguments: + - type: const char * + - name: memccpy + standards: + - POSIX + return_type: void * + arguments: + - type: void *__restrict + - type: const void *__restrict + - type: int + - type: size_t + - name: mempcpy + standards: + - POSIX + return_type: void * + arguments: + - type: void *__restrict + - type: const void *__restrict + - type: size_t + - name: stpcpy + standards: + - POSIX + return_type: char * + arguments: + - type: char *__restrict + - type: const char *__restrict + - name: stpncpy + standards: + - POSIX + return_type: char * + arguments: + - type: char *__restrict + - type: const char *__restrict + - type: size_t + - name: strnlen + standards: + - POSIX + return_type: size_t + arguments: + - type: const char * + - type: size_t + - name: strtok_r + standards: + - POSIX + return_type: char * + arguments: + - type: char *__restrict + - type: const char *__restrict + - type: char ** __restrict + - name: strsignal + standards: + - POSIX + return_type: char * + arguments: + - type: int + - name: memmem + standards: + - GNUExtensions + return_type: void * + arguments: + - type: const void * + - type: size_t + - type: const void * + - type: size_t + - name: memrchr + standards: + - GNUExtensions + return_type: void * + arguments: + - type: const void * + - type: int + - type: size_t + - name: strerror_r + standards: + - GNUExtensions + return_type: char * + arguments: + - type: int + - type: char * + - type: size_t + - name: strcasestr + standards: + - GNUExtensions + return_type: char * + arguments: + - type: const char * + - type: const char * + - name: strchrnul + standards: + - GNUExtensions + return_type: char * + arguments: + - type: const char * + - type: int + - name: strlcat + standards: + - BSDExtensions + return_type: size_t + arguments: + - type: char *__restrict + - type: char 
*__restrict + - type: size_t + - name: strlcpy + standards: + - BSDExtensions + return_type: size_t + arguments: + - type: char *__restrict + - type: char *__restrict + - type: size_t + - name: strsep + standards: + - BSDExtensions + return_type: char * + arguments: + - type: char **__restrict + - type: char *__restrict diff --git a/libc/newhdrgen/yaml_combined/strings.yaml b/libc/newhdrgen/yaml_combined/strings.yaml new file mode 100644 index 00000000000000..3bc0c4a76dd1bf --- /dev/null +++ b/libc/newhdrgen/yaml_combined/strings.yaml @@ -0,0 +1,58 @@ +header: strings.h +macros: [] +types: [] +enums: [] +objects: [] +functions: + - name: bcopy + standards: + - llvm_libc_ext + return_type: void + arguments: + - type: const void * + - type: void * + - type: size_t + - name: bzero + standards: + - llvm_libc_ext + return_type: void + arguments: + - type: void * + - type: size_t + - name: bcmp + standards: + - llvm_libc_ext + return_type: int + arguments: + - type: const void * + - type: const void * + - type: size_t + - name: strcasecmp + standards: + - BSDExtensions + return_type: int + arguments: + - type: const char * + - type: const char * + - name: strncasecmp + standards: + - BSDExtensions + return_type: int + arguments: + - type: const char * + - type: const char * + - type: size_t + - name: index + standards: + - BSDExtensions + return_type: char * + arguments: + - type: const char * + - type: int + - name: rindex + standards: + - BSDExtensions + return_type: char * + arguments: + - type: const char * + - type: int diff --git a/libc/newhdrgen/yaml_combined/sys_mman.yaml b/libc/newhdrgen/yaml_combined/sys_mman.yaml new file mode 100644 index 00000000000000..7b858db2de11a7 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/sys_mman.yaml @@ -0,0 +1,118 @@ +#known as sys/mman in POSIX +header: sys-mman.h +standards: POSIX +macros: [] +macros: [] +types: + - type_name: mode_t + - type_name: size_t + - type_name: off_t +enums: [] +objects: [] +functions: + - name: madvise + standards: + - POSIX + return_type: int + arguments: + - type: void * + - type: size_t + - type: int + - name: mmap + standards: + - POSIX + return_type: void * + arguments: + - type: void * + - type: size_t + - type: int + - type: int + - type: int + - type: off_t + - name: mprotect + standards: + - POSIX + return_type: int + arguments: + - type: void * + - type: size_t + - type: int + - name: munmap + standards: + - POSIX + return_type: int + arguments: + - type: void * + - type: size_t + - name: POSIX_madvise + standards: + - POSIX + return_type: int + arguments: + - type: void * + - type: size_t + - type: int + # this is a linux function + - name: mincore + standards: + - Linux + return_type: int + arguments: + - type: void * + - type: size_t + - type: unsigned char * + - name: mlock + standards: + - POSIX + return_type: int + arguments: + - type: void * + - type: size_t + - name: mlock2 + standards: + - Linux + return_type: int + arguments: + - type: void * + - type: size_t + - type: unsigned int + - name: munlock + standards: POSIX + return_type: int + arguments: + - type: void * + - type: size_t + - name: mlockall + standards: + -POSIX + return_type: int + arguments: + - type: int + - name: munlockall + standards: + - POSIX + return_type: int + arguments: [] + - name: msync + standards: + - POSIX + return_type: int + arguments: + - type: void * + - type: size_t + - type: int + - name: shm_open + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: int + - type: mode_t + - name: 
shm_unlink + standards: + - POSIX + return_type: int + arguments: + - type: const char * + \ No newline at end of file diff --git a/libc/newhdrgen/yaml_combined/sys_wait.yaml b/libc/newhdrgen/yaml_combined/sys_wait.yaml new file mode 100644 index 00000000000000..56a6066e925d68 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/sys_wait.yaml @@ -0,0 +1,33 @@ +header: sys-wait.h #sys/wait.h +macros: [] +types: + - type_name: siginfo_t + - type_name: struct_rusage + - type_name: pid_t +enums: [] +objects: [] +functions: + - name: wait + standards: + - POSIX + return_type: pid_t + arguments: + - type: int * + - name: wait4 + standards: + - BSDExtensions + - POSIX + return_type: pid_t + arguments: + - type: pid_t + - type: int * + - type: int + - type: struct rusage * + - name: waitpid + standards: + - POSIX + return_type: pid_t + arguments: + - type: pid_t + - type: int * + - type: int diff --git a/libc/newhdrgen/yaml_combined/time.yaml b/libc/newhdrgen/yaml_combined/time.yaml new file mode 100644 index 00000000000000..fbdf1877924846 --- /dev/null +++ b/libc/newhdrgen/yaml_combined/time.yaml @@ -0,0 +1,83 @@ +header: time.h +macros: [] +types: + - type_name: struct_timeval + - type_name: clockid_t + - type_name: struct_timespec + - type_name: struct_tm + - type_name: time_t + - type_name: clock_t +enums: [] +objects: [] +functions: + - name: asctime + standard: + - stdc + return_type: char * + arguments: + - type: struct tm * + - name: asctime_r + standard: + - stdc + return_type: char * + arguments: + - type: struct tm * + - type: char * + - name: clock_gettime + standard: + - POSIX + return_type: int + arguments: + - type: clockid_t + - type: struct timespec * + - name: clock + standard: + - stdc + return_type: clock_t + arguments: [] + - name: difftime + standard: + - stdc + return_type: double + arguments: + - type: time_t + - type: time_t + - name: gettimeofday + standard: + - POSIX + return_type: int + arguments: + - type: struct timeval * + - type: void * + - name: gmtime + standard: + - stdc + return_type: struct tm * + arguments: + - type: time_t * + - name: gmtime_r + standard: + - stdc + return_type: struct tm * + arguments: + - type: time_t * + - type: struct tm * + - name: mktime + standard: + - stdc + return_type: time_t + arguments: + - type: struct tm * + - name: nanosleep + standard: + - POSIX + return_type: int + arguments: + - type: const struct timespec * + - type: struct timespec * + - name: time + standard: + - stdc + return_type: time_t + arguments: + - type: time_t * diff --git a/libc/newhdrgen/yaml_to_classes.py b/libc/newhdrgen/yaml_to_classes.py new file mode 100644 index 00000000000000..7159dd9cc88817 --- /dev/null +++ b/libc/newhdrgen/yaml_to_classes.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# +# ===- Generate headers for libc functions -------------------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +import yaml +import re +import argparse + +from pathlib import Path +from header import HeaderFile +from class_implementation.classes.macro import Macro +from class_implementation.classes.type import Type +from class_implementation.classes.function import Function +from class_implementation.classes.include import Include +from class_implementation.classes.enumeration import Enumeration +from class_implementation.classes.object import Object + + +def yaml_to_classes(yaml_data): + """ + Convert YAML data to header classes. + + Args: + yaml_data: The YAML data containing header specifications. + + Returns: + HeaderFile: An instance of HeaderFile populated with the data. + """ + header_name = yaml_data.get("header") + header = HeaderFile(header_name) + + for macro_data in yaml_data.get("macros", []): + header.add_macro(Macro(macro_data["macro_name"], macro_data["macro_value"])) + + for type_data in yaml_data.get("types", []): + header.add_type(Type(type_data["type_name"])) + + for enum_data in yaml_data.get("enums", []): + header.add_enumeration( + Enumeration(enum_data["name"], enum_data.get("value", None)) + ) + + for function_data in yaml_data.get("functions", []): + arguments = [arg["type"] for arg in function_data["arguments"]] + guard = function_data.get("guard", None) + attributes = function_data.get("attributes", None) + standards = (function_data.get("standards", None),) + header.add_function( + Function( + standards, + function_data["return_type"], + function_data["name"], + arguments, + guard, + attributes, + ) + ) + + for object_data in yaml_data.get("objects", []): + header.add_object( + Object(object_data["object_name"], object_data["object_type"]) + ) + + for include_data in yaml_data.get("includes", []): + header.add_include(Include(include_data)) + + return header + + +def load_yaml_file(yaml_file): + """ + Load YAML file and convert it to header classes. + + Args: + yaml_file: The path to the YAML file. + + Returns: + HeaderFile: An instance of HeaderFile populated with the data from the YAML file. + """ + with open(yaml_file, "r") as f: + yaml_data = yaml.safe_load(f) + return yaml_to_classes(yaml_data) + + +def fill_public_api(header_str, h_def_content): + """ + Replace the %%public_api() placeholder in the .h.def content with the generated header content. + + Args: + header_str: The generated header string. + h_def_content: The content of the .h.def file. + + Returns: + The final header content with the public API filled in. + """ + return h_def_content.replace("%%public_api()", header_str, 1) + + +def main(yaml_file, h_def_file, output_dir): + """ + Main function to generate header files from YAML and .h.def templates. + + Args: + yaml_file: Path to the YAML file containing header specification. + h_def_file: Path to the .h.def template file. + output_dir: Directory to output the generated header file. 
+ """ + + header = load_yaml_file(yaml_file) + + with open(h_def_file, "r") as f: + h_def_content = f.read() + + header_str = str(header) + final_header_content = fill_public_api(header_str, h_def_content) + + output_file_name = Path(h_def_file).stem + output_file_path = Path(output_dir) / output_file_name + + with open(output_file_path, "w") as f: + f.write(final_header_content) + + print(f"Generated header file: {output_file_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate header files from YAML and .h.def templates" + ) + parser.add_argument( + "yaml_file", help="Path to the YAML file containing header specification" + ) + parser.add_argument("h_def_file", help="Path to the .h.def template file") + parser.add_argument( + "--output_dir", + default=".", + help="Directory to output the generated header file", + ) + args = parser.parse_args() + + main(args.yaml_file, args.h_def_file, args.output_dir) diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td index ca61d4ef371a2e..74a90a4d041321 100644 --- a/libc/spec/llvm_libc_ext.td +++ b/libc/spec/llvm_libc_ext.td @@ -51,8 +51,25 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> { ] >; + HeaderSpec Math = HeaderSpec< + "math.h", + [], // Macros + [], // Types + [], // Enumerations + [ + GuardedFunctionSpec<"f16div", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16divf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16divl", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + + GuardedFunctionSpec<"f16sqrt", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16sqrtf", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16sqrtl", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + ] + >; + let Headers = [ Assert, + Math, Sched, Strings, ]; diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 73d00c3e33aa9c..6e59062dde7273 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -477,7 +477,10 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"f16fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16fmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16fmaf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, FunctionSpec<"fmod", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fmodf", RetValSpec, [ArgSpec, ArgSpec]>, @@ -726,7 +729,9 @@ def StdC : StandardSpec<"stdc"> { GuardedFunctionSpec<"setpayloadsigf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16sqrtf", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16divf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, + + GuardedFunctionSpec<"f16sqrtf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, ] >; diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index 4ede74206e9737..84c5f802710c4a 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -154,7 +154,6 @@ add_header_library( HDRS multiply_add.h DEPENDS - .fma libc.src.__support.common ) @@ 
-199,6 +198,7 @@ add_header_library( HDRS dyadic_float.h DEPENDS + .fenv_impl .fp_bits .multiply_add libc.src.__support.CPP.type_traits diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h index 741e24aa519a79..cff32938229d02 100644 --- a/libc/src/__support/FPUtil/NearestIntegerOperations.h +++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h @@ -199,7 +199,8 @@ round_using_specific_rounding_mode(T x, int rnd) { return x; StorageType trim_value = - bits.get_mantissa() & ((StorageType(1) << trim_size) - 1); + bits.get_mantissa() & + static_cast(((StorageType(1) << trim_size) - 1)); StorageType half_value = static_cast((StorageType(1) << (trim_size - 1))); // If exponent is 0, trimSize will be equal to the mantissa width, and diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h index b9490b52f6b41e..3d16a3cce3a99c 100644 --- a/libc/src/__support/FPUtil/double_double.h +++ b/libc/src/__support/FPUtil/double_double.h @@ -21,12 +21,22 @@ using DoubleDouble = LIBC_NAMESPACE::NumberPair; // The output of Dekker's FastTwoSum algorithm is correct, i.e.: // r.hi + r.lo = a + b exactly // and |r.lo| < eps(r.lo) -// if ssumption: |a| >= |b|, or a = 0. +// Assumption: |a| >= |b|, or a = 0. +template LIBC_INLINE constexpr DoubleDouble exact_add(double a, double b) { DoubleDouble r{0.0, 0.0}; - r.hi = a + b; - double t = r.hi - a; - r.lo = b - t; + if constexpr (FAST2SUM) { + r.hi = a + b; + double t = r.hi - a; + r.lo = b - t; + } else { + r.hi = a + b; + double t1 = r.hi - a; + double t2 = r.hi - t1; + double t3 = b - t1; + double t4 = a - t2; + r.lo = t3 + t4; + } return r; } @@ -40,15 +50,20 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, // Assumption: |a.hi| >= |b| LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) { - DoubleDouble r = exact_add(a.hi, b); + DoubleDouble r = exact_add(a.hi, b); return exact_add(r.hi, r.lo + a.lo); } -// Velkamp's Splitting for double precision. -LIBC_INLINE constexpr DoubleDouble split(double a) { +// Veltkamp's Splitting for double precision. +// Note: This is proved to be correct for all rounding modes: +// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed +// Roundings," https://inria.hal.science/hal-04480440. +// Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1. +template LIBC_INLINE constexpr DoubleDouble split(double a) { DoubleDouble r{0.0, 0.0}; - // Splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1. - constexpr double C = 0x1.0p27 + 1.0; + // CN = 2^N. + constexpr double CN = static_cast(1 << N); + constexpr double C = CN + 1.0; double t1 = C * a; double t2 = a - t1; r.hi = t1 + t2; @@ -56,6 +71,14 @@ LIBC_INLINE constexpr DoubleDouble split(double a) { return r; } +// Note: When FMA instruction is not available, the `exact_mult` function is +// only correct for round-to-nearest mode. See: +// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed +// Roundings," https://inria.hal.science/hal-04480440. +// Using Theorem 1 in the paper above, without FMA instruction, if we restrict +// the generated constants to precision <= 51, and splitting it by 2^28 + 1, +// then a * b = r.hi + r.lo is exact for all rounding modes. +template LIBC_INLINE DoubleDouble exact_mult(double a, double b) { DoubleDouble r{0.0, 0.0}; @@ -65,7 +88,13 @@ LIBC_INLINE DoubleDouble exact_mult(double a, double b) { #else // Dekker's Product. 
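// What Dekker's product guarantees here: after Veltkamp splitting, a and b
// are each written as a sum of two "half-width" doubles, so every partial
// product of the halves is exact; combining them recovers the rounding error
// of r.hi = a * b, and r.hi + r.lo == a * b exactly (barring overflow).
// Worked example: a = 1 + 2^-30, b = 1 - 2^-30. The true product 1 - 2^-60
// needs more than 52 fraction bits, so r.hi rounds (to nearest) to 1.0 and
// r.lo = -2^-60, which is exactly the part that rounding discarded.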
DoubleDouble as = split(a); - DoubleDouble bs = split(b); + DoubleDouble bs; + + if constexpr (NO_FMA_ALL_ROUNDINGS) + bs = split<28>(b); + else + bs = split(b); + r.hi = a * b; double t1 = as.hi * bs.hi - r.hi; double t2 = as.hi * bs.lo + t1; @@ -82,9 +111,10 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) { return r; } +template LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a, const DoubleDouble &b) { - DoubleDouble r = exact_mult(a.hi, b.hi); + DoubleDouble r = exact_mult(a.hi, b.hi); double t1 = multiply_add(a.hi, b.lo, r.lo); double t2 = multiply_add(a.lo, b.hi, t1); r.lo = t2; diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 63cb98351201f6..8d44a98a693f87 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H +#include "FEnvImpl.h" #include "FPBits.h" #include "multiply_add.h" #include "src/__support/CPP/type_traits.h" @@ -86,11 +87,11 @@ template struct DyadicFloat { // Assume that it is already normalized. // Output is rounded correctly with respect to the current rounding mode. - template && (FPBits::FRACTION_LEN < Bits), void>> - LIBC_INLINE explicit constexpr operator T() const { + LIBC_INLINE constexpr T as() const { if (LIBC_UNLIKELY(mantissa.is_zero())) return FPBits::zero(sign).get_val(); @@ -107,7 +108,17 @@ template struct DyadicFloat { T d_hi = FPBits::create_value(sign, 2 * FPBits::EXP_BIAS, IMPLICIT_MASK) .get_val(); - return T(2) * d_hi; + // volatile prevents constant propagation that would result in infinity + // always being returned no matter the current rounding mode. + volatile T two = static_cast(2.0); + T r = two * d_hi; + + // TODO: Whether rounding down the absolute value to max_normal should + // also raise FE_OVERFLOW and set ERANGE is debatable. + if (ShouldSignalExceptions && FPBits(r).is_inf()) + set_errno_if_required(ERANGE); + + return r; } bool denorm = false; @@ -145,13 +156,13 @@ template struct DyadicFloat { // d_lo is denormal, but the output is normal. int scale_up_exponent = 1 - exp_lo; T scale_up_factor = - FPBits::create_value(sign, + FPBits::create_value(Sign::POS, static_cast( FPBits::EXP_BIAS + scale_up_exponent), IMPLICIT_MASK) .get_val(); T scale_down_factor = - FPBits::create_value(sign, + FPBits::create_value(Sign::POS, static_cast( FPBits::EXP_BIAS - scale_up_exponent), IMPLICIT_MASK) @@ -179,10 +190,20 @@ template struct DyadicFloat { output_bits_t clear_exp = static_cast( output_bits_t(exp_hi) << FPBits::SIG_LEN); output_bits_t r_bits = FPBits(r).uintval() - clear_exp; + if (!(r_bits & FPBits::EXP_MASK)) { // Output is denormal after rounding, clear the implicit bit for 80-bit // long double. r_bits -= IMPLICIT_MASK; + + // TODO: IEEE Std 754-2019 lets implementers choose whether to check for + // "tininess" before or after rounding for base-2 formats, as long as + // the same choice is made for all operations. Our choice to check after + // rounding might not be the same as the hardware's. 
+ if (ShouldSignalExceptions && round_and_sticky) { + set_errno_if_required(ERANGE); + raise_except_if_required(FE_UNDERFLOW); + } } return FPBits(r_bits).get_val(); @@ -191,6 +212,14 @@ template struct DyadicFloat { return r; } + template && + (FPBits::FRACTION_LEN < Bits), + void>> + LIBC_INLINE explicit constexpr operator T() const { + return as(); + } + LIBC_INLINE explicit constexpr operator MantissaType() const { if (mantissa.is_zero()) return 0; @@ -278,11 +307,11 @@ LIBC_INLINE constexpr DyadicFloat quick_add(DyadicFloat a, // don't need to normalize the inputs again in this function. If the inputs are // not normalized, the results might lose precision significantly. template -LIBC_INLINE constexpr DyadicFloat quick_mul(DyadicFloat a, - DyadicFloat b) { +LIBC_INLINE constexpr DyadicFloat quick_mul(const DyadicFloat &a, + const DyadicFloat &b) { DyadicFloat result; result.sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS; - result.exponent = a.exponent + b.exponent + int(Bits); + result.exponent = a.exponent + b.exponent + static_cast(Bits); if (!(a.mantissa.is_zero() || b.mantissa.is_zero())) { result.mantissa = a.mantissa.quick_mul_hi(b.mantissa); @@ -309,7 +338,7 @@ multiply_add(const DyadicFloat &a, const DyadicFloat &b, // Simple exponentiation implementation for printf. Only handles positive // exponents, since division isn't implemented. template -LIBC_INLINE constexpr DyadicFloat pow_n(DyadicFloat a, +LIBC_INLINE constexpr DyadicFloat pow_n(const DyadicFloat &a, uint32_t power) { DyadicFloat result = 1.0; DyadicFloat cur_power = a; @@ -325,7 +354,7 @@ LIBC_INLINE constexpr DyadicFloat pow_n(DyadicFloat a, } template -LIBC_INLINE constexpr DyadicFloat mul_pow_2(DyadicFloat a, +LIBC_INLINE constexpr DyadicFloat mul_pow_2(const DyadicFloat &a, int32_t pow_2) { DyadicFloat result = a; result.exponent += pow_2; diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt index a8a95ba3f15ffa..fb49fd00395371 100644 --- a/libc/src/__support/FPUtil/generic/CMakeLists.txt +++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt @@ -8,6 +8,7 @@ add_header_library( libc.src.__support.common libc.src.__support.CPP.bit libc.src.__support.CPP.type_traits + libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode @@ -24,6 +25,8 @@ add_header_library( libc.src.__support.CPP.bit libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode @@ -45,3 +48,20 @@ add_header_library( libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization ) + +add_header_library( + div + HDRS + div.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.CPP.bit + libc.src.__support.CPP.type_traits + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.macros.attributes + libc.src.__support.macros.optimization +) diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 71b150758d4191..72341a129dcb58 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -12,7 +12,9 @@ #include "src/__support/CPP/bit.h" #include 
"src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" +#include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/big_int.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE @@ -106,20 +108,52 @@ LIBC_INLINE cpp::enable_if_t && sizeof(OutType) <= sizeof(InType), OutType> fma(InType x, InType y, InType z) { - using OutFPBits = fputil::FPBits; + using OutFPBits = FPBits; using OutStorageType = typename OutFPBits::StorageType; - using InFPBits = fputil::FPBits; + using InFPBits = FPBits; using InStorageType = typename InFPBits::StorageType; constexpr int IN_EXPLICIT_MANT_LEN = InFPBits::FRACTION_LEN + 1; constexpr size_t PROD_LEN = 2 * IN_EXPLICIT_MANT_LEN; constexpr size_t TMP_RESULT_LEN = cpp::bit_ceil(PROD_LEN + 1); using TmpResultType = UInt; + using DyadicFloat = DyadicFloat; - constexpr size_t EXTRA_FRACTION_LEN = - TMP_RESULT_LEN - 1 - OutFPBits::FRACTION_LEN; - constexpr TmpResultType EXTRA_FRACTION_STICKY_MASK = - (TmpResultType(1) << (EXTRA_FRACTION_LEN - 1)) - 1; + InFPBits x_bits(x), y_bits(y), z_bits(z); + + if (LIBC_UNLIKELY(x_bits.is_nan() || y_bits.is_nan() || z_bits.is_nan())) { + if (x_bits.is_nan() || y_bits.is_nan()) { + if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan() || + z_bits.is_signaling_nan()) + raise_except_if_required(FE_INVALID); + + if (x_bits.is_quiet_nan()) { + InStorageType x_payload = static_cast(getpayload(x)); + if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) + return OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_payload)) + .get_val(); + } + + if (y_bits.is_quiet_nan()) { + InStorageType y_payload = static_cast(getpayload(y)); + if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) + return OutFPBits::quiet_nan(y_bits.sign(), + static_cast(y_payload)) + .get_val(); + } + + if (z_bits.is_quiet_nan()) { + InStorageType z_payload = static_cast(getpayload(z)); + if ((z_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) + return OutFPBits::quiet_nan(z_bits.sign(), + static_cast(z_payload)) + .get_val(); + } + + return OutFPBits::quiet_nan().get_val(); + } + } if (LIBC_UNLIKELY(x == 0 || y == 0 || z == 0)) return static_cast(x * y + z); @@ -142,7 +176,9 @@ fma(InType x, InType y, InType z) { z *= InType(InStorageType(1) << InFPBits::FRACTION_LEN); } - InFPBits x_bits(x), y_bits(y), z_bits(z); + x_bits = InFPBits(x); + y_bits = InFPBits(y); + z_bits = InFPBits(z); const Sign z_sign = z_bits.sign(); Sign prod_sign = (x_bits.sign() == y_bits.sign()) ? Sign::POS : Sign::NEG; x_exp += x_bits.get_biased_exponent(); @@ -182,7 +218,6 @@ fma(InType x, InType y, InType z) { constexpr int RESULT_MIN_LEN = PROD_LEN - InFPBits::FRACTION_LEN; z_mant <<= RESULT_MIN_LEN; int z_lsb_exp = z_exp - (InFPBits::FRACTION_LEN + RESULT_MIN_LEN); - bool round_bit = false; bool sticky_bits = false; bool z_shifted = false; @@ -221,85 +256,18 @@ fma(InType x, InType y, InType z) { } } - OutStorageType result = 0; - int r_exp = 0; // Unbiased exponent of the result - - int round_mode = fputil::quick_get_round(); - - // Normalize the result. - if (prod_mant != 0) { - int lead_zeros = cpp::countl_zero(prod_mant); - // Move the leading 1 to the most significant bit. - prod_mant <<= lead_zeros; - prod_lsb_exp -= lead_zeros; - r_exp = prod_lsb_exp + (cpp::numeric_limits::digits - 1) - - InFPBits::EXP_BIAS + OutFPBits::EXP_BIAS; - - if (r_exp > 0) { - // The result is normal. 
We will shift the mantissa to the right by the - // amount of extra bits compared to the length of the explicit mantissa in - // the output type. The rounding bit then becomes the highest bit that is - // shifted out, and the following lower bits are merged into sticky bits. - round_bit = - (prod_mant & (TmpResultType(1) << (EXTRA_FRACTION_LEN - 1))) != 0; - sticky_bits |= (prod_mant & EXTRA_FRACTION_STICKY_MASK) != 0; - result = static_cast(prod_mant >> EXTRA_FRACTION_LEN); - } else { - if (r_exp < -OutFPBits::FRACTION_LEN) { - // The result is smaller than 1/2 of the smallest denormal number. - sticky_bits = true; // since the result is non-zero. - result = 0; - } else { - // The result is denormal. - TmpResultType mask = TmpResultType(1) << (EXTRA_FRACTION_LEN - r_exp); - round_bit = (prod_mant & mask) != 0; - sticky_bits |= (prod_mant & (mask - 1)) != 0; - if (r_exp > -OutFPBits::FRACTION_LEN) - result = static_cast( - prod_mant >> (EXTRA_FRACTION_LEN + 1 - r_exp)); - else - result = 0; - } - - r_exp = 0; - } - } else { + if (prod_mant == 0) { // When there is exact cancellation, i.e., x*y == -z exactly, return -0.0 if // rounding downward and +0.0 for other rounding modes. - if (round_mode == FE_DOWNWARD) + if (quick_get_round() == FE_DOWNWARD) prod_sign = Sign::NEG; else prod_sign = Sign::POS; } - // Finalize the result. - if (LIBC_UNLIKELY(r_exp >= OutFPBits::MAX_BIASED_EXPONENT)) { - if ((round_mode == FE_TOWARDZERO) || - (round_mode == FE_UPWARD && prod_sign.is_neg()) || - (round_mode == FE_DOWNWARD && prod_sign.is_pos())) { - return OutFPBits::max_normal(prod_sign).get_val(); - } - return OutFPBits::inf(prod_sign).get_val(); - } - - // Remove hidden bit and append the exponent field and sign bit. - result = static_cast( - (result & OutFPBits::FRACTION_MASK) | - (static_cast(r_exp) << OutFPBits::FRACTION_LEN)); - if (prod_sign.is_neg()) - result |= OutFPBits::SIGN_MASK; - - // Rounding. - if (round_mode == FE_TONEAREST) { - if (round_bit && (sticky_bits || ((result & 1) != 0))) - ++result; - } else if ((round_mode == FE_UPWARD && prod_sign.is_pos()) || - (round_mode == FE_DOWNWARD && prod_sign.is_neg())) { - if (round_bit || sticky_bits) - ++result; - } - - return cpp::bit_cast(result); + DyadicFloat result(prod_sign, prod_lsb_exp - InFPBits::EXP_BIAS, prod_mant); + result.mantissa |= static_cast(sticky_bits); + return result.template as(); } } // namespace generic diff --git a/libc/src/__support/FPUtil/generic/div.h b/libc/src/__support/FPUtil/generic/div.h new file mode 100644 index 00000000000000..0d84aa8d8bccc1 --- /dev/null +++ b/libc/src/__support/FPUtil/generic/div.h @@ -0,0 +1,123 @@ +//===-- Division of IEEE 754 floating-point numbers -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_DIV_H +#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_DIV_H + +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE::fputil::generic { + +template +LIBC_INLINE cpp::enable_if_t && + cpp::is_floating_point_v && + sizeof(OutType) <= sizeof(InType), + OutType> +div(InType x, InType y) { + using OutFPBits = FPBits; + using OutStorageType = typename OutFPBits::StorageType; + using InFPBits = FPBits; + using InStorageType = typename InFPBits::StorageType; + using DyadicFloat = + DyadicFloat(InFPBits::FRACTION_LEN))>; + + InFPBits x_bits(x); + InFPBits y_bits(y); + + Sign result_sign = x_bits.sign() == y_bits.sign() ? Sign::POS : Sign::NEG; + + if (LIBC_UNLIKELY(x_bits.is_inf_or_nan() || y_bits.is_inf_or_nan() || + x_bits.is_zero() || y_bits.is_zero())) { + if (x_bits.is_nan() || y_bits.is_nan()) { + if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan()) + raise_except_if_required(FE_INVALID); + + if (x_bits.is_quiet_nan()) { + InStorageType x_payload = static_cast(getpayload(x)); + if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) + return OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_payload)) + .get_val(); + } + + if (y_bits.is_quiet_nan()) { + InStorageType y_payload = static_cast(getpayload(y)); + if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) + return OutFPBits::quiet_nan(y_bits.sign(), + static_cast(y_payload)) + .get_val(); + } + + return OutFPBits::quiet_nan().get_val(); + } + + if (x_bits.is_inf()) { + if (y_bits.is_inf()) { + set_errno_if_required(EDOM); + raise_except_if_required(FE_INVALID); + return OutFPBits::quiet_nan().get_val(); + } + + return OutFPBits::inf(result_sign).get_val(); + } + + if (y_bits.is_inf()) + return OutFPBits::inf(result_sign).get_val(); + + if (y_bits.is_zero()) { + if (x_bits.is_zero()) { + raise_except_if_required(FE_INVALID); + return OutFPBits::quiet_nan().get_val(); + } + + raise_except_if_required(FE_DIVBYZERO); + return OutFPBits::inf(result_sign).get_val(); + } + + if (x_bits.is_zero()) + return OutFPBits::zero(result_sign).get_val(); + } + + DyadicFloat xd(x); + DyadicFloat yd(y); + + // Number of iterations = full output precision + 1 rounding bit + 1 potential + // leading 0. 
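// [Illustrative sketch, not part of this patch] The loop that follows is plain
// restoring division on the two normalized mantissas: each iteration yields
// one quotient bit, and whatever remains in `r` afterwards only matters as a
// sticky bit for rounding. The pre-shifts of the mantissas in the code below
// establish r < y before the first step. The same recurrence on small
// integers (hypothetical values):
#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t x_mant = 0b101, y_mant = 0b110; // invariant: x_mant < y_mant
  std::uint64_t q = 0, r = x_mant;
  constexpr int NUM_BITS = 8; // number of quotient bits to produce
  for (int i = 0; i < NUM_BITS; ++i) {
    q <<= 1;
    r <<= 1;
    if (r >= y_mant) { // restoring step: subtract only when it fits
      q |= 1;
      r -= y_mant;
    }
  }
  bool sticky = (r != 0); // leftover remainder => result is inexact
  // Prints q=0xd5 sticky=1, i.e. floor(5 * 2^8 / 6) plus a sticky bit.
  std::printf("q=0x%llx sticky=%d\n", (unsigned long long)q, (int)sticky);
}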
+ constexpr int NUM_ITERS = OutFPBits::FRACTION_LEN + 3; + int result_exp = xd.exponent - yd.exponent - (NUM_ITERS - 1); + + InStorageType q = 0; + InStorageType r = static_cast(xd.mantissa >> 2); + InStorageType yd_mant_in = static_cast(yd.mantissa >> 1); + + for (int i = 0; i < NUM_ITERS; ++i) { + q <<= 1; + r <<= 1; + if (r >= yd_mant_in) { + q += 1; + r -= yd_mant_in; + } + } + + DyadicFloat result(result_sign, result_exp, q); + result.mantissa |= static_cast(r != 0); + return result.template as(); +} + +} // namespace LIBC_NAMESPACE::fputil::generic + +#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_DIV_H diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index d6e894fdfe021b..1b713796f74991 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -14,7 +14,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/common.h" #include "src/__support/uint128.h" @@ -78,16 +78,14 @@ sqrt(InType x) { return x86::sqrt(x); } else { // IEEE floating points formats. - using OutFPBits = typename fputil::FPBits; - using OutStorageType = typename OutFPBits::StorageType; - using InFPBits = typename fputil::FPBits; + using OutFPBits = FPBits; + using InFPBits = FPBits; using InStorageType = typename InFPBits::StorageType; + using DyadicFloat = + DyadicFloat(InFPBits::STORAGE_LEN))>; + constexpr InStorageType ONE = InStorageType(1) << InFPBits::FRACTION_LEN; constexpr auto FLT_NAN = OutFPBits::quiet_nan().get_val(); - constexpr int EXTRA_FRACTION_LEN = - InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; - constexpr InStorageType EXTRA_FRACTION_MASK = - (InStorageType(1) << EXTRA_FRACTION_LEN) - 1; InFPBits bits(x); @@ -135,6 +133,7 @@ sqrt(InType x) { InStorageType y = ONE; InStorageType r = x_mant - ONE; + // TODO: Reduce iteration count to OutFPBits::FRACTION_LEN + 2 or + 3. for (InStorageType current_bit = ONE >> 1; current_bit; current_bit >>= 1) { r <<= 1; @@ -146,91 +145,19 @@ sqrt(InType x) { } // We compute one more iteration in order to round correctly. - bool lsb = (y & (InStorageType(1) << EXTRA_FRACTION_LEN)) != - 0; // Least significant bit - bool rb = false; // Round bit r <<= 2; - InStorageType tmp = (y << 2) + 1; + y <<= 2; + InStorageType tmp = y + 1; if (r >= tmp) { r -= tmp; - rb = true; - } - - bool sticky = false; - - if constexpr (EXTRA_FRACTION_LEN > 0) { - sticky = rb || (y & EXTRA_FRACTION_MASK) != 0; - rb = (y & (InStorageType(1) << (EXTRA_FRACTION_LEN - 1))) != 0; - } - - // Remove hidden bit and append the exponent field. 
- x_exp = ((x_exp >> 1) + OutFPBits::EXP_BIAS); - - OutStorageType y_out = static_cast( - ((y - ONE) >> EXTRA_FRACTION_LEN) | - (static_cast(x_exp) << OutFPBits::FRACTION_LEN)); - - if constexpr (EXTRA_FRACTION_LEN > 0) { - if (x_exp >= OutFPBits::MAX_BIASED_EXPONENT) { - switch (quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - return OutFPBits::inf().get_val(); - default: - return OutFPBits::max_normal().get_val(); - } - } - - if (x_exp < - -OutFPBits::EXP_BIAS - OutFPBits::SIG_LEN + EXTRA_FRACTION_LEN) { - switch (quick_get_round()) { - case FE_UPWARD: - return OutFPBits::min_subnormal().get_val(); - default: - return OutType(0.0); - } - } - - if (x_exp <= 0) { - int underflow_extra_fraction_len = EXTRA_FRACTION_LEN - x_exp + 1; - InStorageType underflow_extra_fraction_mask = - (InStorageType(1) << underflow_extra_fraction_len) - 1; - - rb = (y & (InStorageType(1) << (underflow_extra_fraction_len - 1))) != - 0; - OutStorageType subnormal_mant = - static_cast(y >> underflow_extra_fraction_len); - lsb = (subnormal_mant & 1) != 0; - sticky = sticky || (y & underflow_extra_fraction_mask) != 0; - - switch (quick_get_round()) { - case FE_TONEAREST: - if (rb && (lsb || sticky)) - ++subnormal_mant; - break; - case FE_UPWARD: - if (rb || sticky) - ++subnormal_mant; - break; - } - - return cpp::bit_cast(subnormal_mant); - } - } - - switch (quick_get_round()) { - case FE_TONEAREST: - // Round to nearest, ties to even - if (rb && (lsb || (r != 0))) - ++y_out; - break; - case FE_UPWARD: - if (rb || (r != 0) || sticky) - ++y_out; - break; + // Rounding bit. + y |= 2; } + // Sticky bit. + y |= static_cast(r != 0); - return cpp::bit_cast(y_out); + DyadicFloat yd(Sign::POS, (x_exp >> 1) - 2 - InFPBits::FRACTION_LEN, y); + return yd.template as(); } } } diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h index 622914e4265c91..1b9eee4ace62d0 100644 --- a/libc/src/__support/FPUtil/multiply_add.h +++ b/libc/src/__support/FPUtil/multiply_add.h @@ -39,17 +39,18 @@ multiply_add(T x, T y, T z) { #if defined(LIBC_TARGET_CPU_HAS_FMA) // FMA instructions are available. -#include "FMA.h" +// We use builtins directly instead of including FMA.h to avoid a circular +// dependency: multiply_add.h -> FMA.h -> generic/FMA.h -> dyadic_float.h. namespace LIBC_NAMESPACE { namespace fputil { LIBC_INLINE float multiply_add(float x, float y, float z) { - return fma(x, y, z); + return __builtin_fmaf(x, y, z); } LIBC_INLINE double multiply_add(double x, double y, double z) { - return fma(x, y, z); + return __builtin_fma(x, y, z); } } // namespace fputil diff --git a/libc/src/__support/OSUtil/linux/arm/syscall.h b/libc/src/__support/OSUtil/linux/arm/syscall.h index af100747832750..9674ee45a49e91 100644 --- a/libc/src/__support/OSUtil/linux/arm/syscall.h +++ b/libc/src/__support/OSUtil/linux/arm/syscall.h @@ -12,14 +12,29 @@ #include "src/__support/common.h" #ifdef __thumb__ -#error "The arm syscall implementation does not yet support thumb flavor." 
-#endif // __thumb__ +#define R7 long r7 = number +#define SYSCALL_INSTR(input_constraint) \ + int temp; \ + LIBC_INLINE_ASM(R"( + mov %[temp], r7 + mov r7, %2 + svc #0 + mov r7, %[temp] + )" \ + : "=r"(r0), [temp] "=&r"(temp) \ + : input_constraint \ + : "memory", "cc") +#else +#define R7 register long r7 asm("r7") = number +#define SYSCALL_INSTR(input_constraint) \ + LIBC_INLINE_ASM("svc 0" : "=r"(r0) : input_constraint : "memory", "cc") +#endif #define REGISTER_DECL_0 \ - register long r7 __asm__("r7") = number; \ + R7; \ register long r0 __asm__("r0"); #define REGISTER_DECL_1 \ - register long r7 __asm__("r7") = number; \ + R7; \ register long r0 __asm__("r0") = arg1; #define REGISTER_DECL_2 \ REGISTER_DECL_1 \ @@ -45,9 +60,6 @@ #define REGISTER_CONSTRAINT_5 REGISTER_CONSTRAINT_4, "r"(r4) #define REGISTER_CONSTRAINT_6 REGISTER_CONSTRAINT_5, "r"(r5) -#define SYSCALL_INSTR(input_constraint) \ - LIBC_INLINE_ASM("svc 0" : "=r"(r0) : input_constraint : "memory", "cc") - namespace LIBC_NAMESPACE { LIBC_INLINE long syscall_impl(long number) { diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp index b087f898c395d6..083deb8c336813 100644 --- a/libc/src/__support/OSUtil/linux/fcntl.cpp +++ b/libc/src/__support/OSUtil/linux/fcntl.cpp @@ -67,15 +67,11 @@ int fcntl(int fd, int cmd, void *arg) { } case F_GETOWN: { struct f_owner_ex fex; - int retVal = + int ret = LIBC_NAMESPACE::syscall_impl(SYS_fcntl, fd, F_GETOWN_EX, &fex); - if (retVal == -EINVAL) - return LIBC_NAMESPACE::syscall_impl(SYS_fcntl, fd, cmd, - reinterpret_cast(arg)); - if (static_cast(retVal) <= -4096UL) + if (ret >= 0) return fex.type == F_OWNER_PGRP ? -fex.pid : fex.pid; - - libc_errno = -retVal; + libc_errno = -ret; return -1; } // The general case diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h index 5ce9541d73f68e..59a3912ef0f098 100644 --- a/libc/src/__support/big_int.h +++ b/libc/src/__support/big_int.h @@ -387,7 +387,8 @@ struct BigInt { } // Initialize the first word to |v| and the rest to 0. - template >> + template && + !cpp::is_same_v>> LIBC_INLINE constexpr BigInt(T v) { constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT; const bool is_neg = Signed && (v < 0); @@ -440,7 +441,7 @@ struct BigInt { constexpr size_t MAX_COUNT = T_SIZE > Bits ? WORD_COUNT : T_SIZE / WORD_SIZE; for (size_t i = 1; i < MAX_COUNT; ++i) - lo += static_cast(val[i]) << (WORD_SIZE * i); + lo += static_cast(static_cast(val[i]) << (WORD_SIZE * i)); if constexpr (Signed && (T_SIZE > Bits)) { // Extend sign for negative numbers. 
constexpr T MASK = (~T(0) << Bits); diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h index 403b1620d20df2..5161c0d7a533cf 100644 --- a/libc/src/__support/fixedvector.h +++ b/libc/src/__support/fixedvector.h @@ -90,6 +90,11 @@ template class FixedVector { LIBC_INLINE constexpr iterator begin() { return store.begin(); } LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; } + + LIBC_INLINE constexpr const_iterator begin() const { return store.begin(); } + LIBC_INLINE constexpr const_iterator end() const { + return const_iterator{&store[item_count]}; + } }; } // namespace LIBC_NAMESPACE diff --git a/libc/src/__support/high_precision_decimal.h b/libc/src/__support/high_precision_decimal.h index 2c5a349e4495eb..6a22554165014f 100644 --- a/libc/src/__support/high_precision_decimal.h +++ b/libc/src/__support/high_precision_decimal.h @@ -409,7 +409,8 @@ class HighPrecisionDecimal { result *= 10; ++cur_digit; } - return result + this->should_round_up(this->decimal_point, round); + return result + static_cast( + this->should_round_up(this->decimal_point, round)); } // Extra functions for testing. diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h index 59886ca44be12a..05a47791deed8a 100644 --- a/libc/src/__support/macros/optimization.h +++ b/libc/src/__support/macros/optimization.h @@ -33,4 +33,18 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) { #error "Unhandled compiler" #endif +// Defining optimization options for math functions. +// TODO: Exporting this to public generated headers? +#define LIBC_MATH_SKIP_ACCURATE_PASS 0x01 +#define LIBC_MATH_SMALL_TABLES 0x02 +#define LIBC_MATH_NO_ERRNO 0x04 +#define LIBC_MATH_NO_EXCEPT 0x08 +#define LIBC_MATH_FAST \ + (LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | \ + LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT) + +#ifndef LIBC_MATH +#define LIBC_MATH 0 +#endif // LIBC_MATH + #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H diff --git a/libc/src/__support/threads/callonce.h b/libc/src/__support/threads/callonce.h index b3d6813f7dda9e..67f4c996571072 100644 --- a/libc/src/__support/threads/callonce.h +++ b/libc/src/__support/threads/callonce.h @@ -9,13 +9,32 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_CALLONCE_H #define LLVM_LIBC_SRC___SUPPORT_THREADS_CALLONCE_H +#include "src/__support/macros/optimization.h" // LIBC_LIKELY + +// Plaform specific routines, provides: +// - OnceFlag definition +// - callonce_impl::callonce_fastpath for fast path check +// - callonce_impl::callonce_slowpath for slow path execution +#ifdef __linux__ +#include "src/__support/threads/linux/callonce.h" +#else +#error "callonce is not supported on this platform" +#endif + namespace LIBC_NAMESPACE { -struct CallOnceFlag; +// Common definitions using CallOnceCallback = void(void); +namespace callonce_impl { +int callonce_slowpath(CallOnceFlag *flag, CallOnceCallback *callback); +} // namespace callonce_impl -int callonce(CallOnceFlag *flag, CallOnceCallback *callback); +LIBC_INLINE int callonce(CallOnceFlag *flag, CallOnceCallback *callback) { + if (LIBC_LIKELY(callonce_impl::callonce_fastpath(flag))) + return 0; + return callonce_impl::callonce_slowpath(flag, callback); +} } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_THREADS_CALLONCE_H diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt index 95e509b7a825dd..8b7971584e77ec 100644 --- 
a/libc/src/__support/threads/linux/CMakeLists.txt +++ b/libc/src/__support/threads/linux/CMakeLists.txt @@ -100,8 +100,10 @@ add_object_library( callonce.cpp HDRS ../callonce.h + callonce.h DEPENDS .futex_utils + libc.src.__support.macros.optimization ) add_object_library( diff --git a/libc/src/__support/threads/linux/callonce.cpp b/libc/src/__support/threads/linux/callonce.cpp index b48a514a448753..24d376f447fba1 100644 --- a/libc/src/__support/threads/linux/callonce.cpp +++ b/libc/src/__support/threads/linux/callonce.cpp @@ -7,27 +7,16 @@ //===----------------------------------------------------------------------===// #include "src/__support/threads/callonce.h" -#include "src/__support/macros/optimization.h" +#include "src/__support/threads/linux/callonce.h" #include "src/__support/threads/linux/futex_utils.h" namespace LIBC_NAMESPACE { - -static constexpr FutexWordType NOT_CALLED = 0x0; -static constexpr FutexWordType START = 0x11; -static constexpr FutexWordType WAITING = 0x22; -static constexpr FutexWordType FINISH = 0x33; - -int callonce(CallOnceFlag *flag, CallOnceCallback *func) { +namespace callonce_impl { +int callonce_slowpath(CallOnceFlag *flag, CallOnceCallback *func) { auto *futex_word = reinterpret_cast(flag); FutexWordType not_called = NOT_CALLED; - // Avoid cmpxchg operation if the function has already been called. - // The destination operand of cmpxchg may receive a write cycle without - // regard to the result of the comparison - if (LIBC_LIKELY(futex_word->load(cpp::MemoryOrder::RELAXED) == FINISH)) - return 0; - // The call_once call can return only after the called function |func| // returns. So, we use futexes to synchronize calls with the same flag value. if (futex_word->compare_exchange_strong(not_called, START)) { @@ -46,5 +35,5 @@ int callonce(CallOnceFlag *flag, CallOnceCallback *func) { return 0; } - +} // namespace callonce_impl } // namespace LIBC_NAMESPACE diff --git a/libc/src/__support/threads/linux/callonce.h b/libc/src/__support/threads/linux/callonce.h new file mode 100644 index 00000000000000..315cc6149e9ecf --- /dev/null +++ b/libc/src/__support/threads/linux/callonce.h @@ -0,0 +1,31 @@ +//===-- Linux callonce fastpath -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_CALLONCE_H +#define LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_CALLONCE_H + +#include "src/__support/threads/linux/futex_utils.h" + +namespace LIBC_NAMESPACE { +using CallOnceFlag = Futex; + +namespace callonce_impl { +static constexpr FutexWordType NOT_CALLED = 0x0; +static constexpr FutexWordType START = 0x11; +static constexpr FutexWordType WAITING = 0x22; +static constexpr FutexWordType FINISH = 0x33; + +// Avoid cmpxchg operation if the function has already been called. +// The destination operand of cmpxchg may receive a write cycle without +// regard to the result of the comparison. 
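// [Illustrative sketch, not part of this patch] The comment above is why
// callonce() checks this fast path before touching the futex: a plain load
// (relaxed in the libc version) avoids the unconditional write cycle of
// cmpxchg once initialization has finished. The overall fast/slow split,
// rewritten against std::atomic instead of the libc Futex type, with a spin
// loop standing in for the futex wait/wake and simplified state values:
#include <atomic>
#include <cstdio>

static std::atomic<unsigned> once_flag{0}; // 0 = not called, 1 = running, 3 = done
static void do_init() { std::puts("init runs exactly once"); }

static void sketch_call_once() {
  if (once_flag.load(std::memory_order_acquire) == 3) // fast path: no stores
    return;
  unsigned expected = 0;
  if (once_flag.compare_exchange_strong(expected, 1)) { // slow path: one winner
    do_init();
    once_flag.store(3, std::memory_order_release); // real code also wakes waiters
  } else {
    while (once_flag.load(std::memory_order_acquire) != 3) {
      // real code: futex wait instead of spinning
    }
  }
}

int main() {
  sketch_call_once();
  sketch_call_once(); // second call takes only the fast-path load
}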
+LIBC_INLINE bool callonce_fastpath(CallOnceFlag *flag) { + return flag->load(cpp::MemoryOrder::RELAXED) == FINISH; +} +} // namespace callonce_impl + +} // namespace LIBC_NAMESPACE +#endif // LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_CALLONCE_H diff --git a/libc/src/__support/threads/thread.h b/libc/src/__support/threads/thread.h index acfe33879f8783..f89c687eeaa19b 100644 --- a/libc/src/__support/threads/thread.h +++ b/libc/src/__support/threads/thread.h @@ -43,6 +43,9 @@ union ThreadReturnValue { defined(LIBC_TARGET_ARCH_IS_X86_64) || \ defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)) constexpr unsigned int STACK_ALIGNMENT = 16; +#elif defined(LIBC_TARGET_ARCH_IS_ARM) +// See Section 6.2.1.2 Stack constraints at a public interface of AAPCS32. +constexpr unsigned int STACK_ALIGNMENT = 8; #endif // TODO: Provide stack alignment requirements for other architectures. diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index a921939ea806c2..4777f669f0cb5b 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -99,9 +99,20 @@ add_math_entrypoint_object(exp10f) add_math_entrypoint_object(expm1) add_math_entrypoint_object(expm1f) +add_math_entrypoint_object(f16div) +add_math_entrypoint_object(f16divf) +add_math_entrypoint_object(f16divl) +add_math_entrypoint_object(f16divf128) + +add_math_entrypoint_object(f16fma) add_math_entrypoint_object(f16fmaf) +add_math_entrypoint_object(f16fmal) +add_math_entrypoint_object(f16fmaf128) +add_math_entrypoint_object(f16sqrt) add_math_entrypoint_object(f16sqrtf) +add_math_entrypoint_object(f16sqrtl) +add_math_entrypoint_object(f16sqrtf128) add_math_entrypoint_object(fabs) add_math_entrypoint_object(fabsf) diff --git a/libc/src/math/f16div.h b/libc/src/math/f16div.h new file mode 100644 index 00000000000000..3807bc02276c9b --- /dev/null +++ b/libc/src/math/f16div.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16div ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16DIV_H +#define LLVM_LIBC_SRC_MATH_F16DIV_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16div(double x, double y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16DIV_H diff --git a/libc/src/math/f16divf.h b/libc/src/math/f16divf.h new file mode 100644 index 00000000000000..a3359d9e479445 --- /dev/null +++ b/libc/src/math/f16divf.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16divf -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16DIVF_H +#define LLVM_LIBC_SRC_MATH_F16DIVF_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16divf(float x, float y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16DIVF_H diff --git a/libc/src/math/f16divf128.h b/libc/src/math/f16divf128.h new file mode 100644 index 00000000000000..2f63535ca27ce2 --- /dev/null +++ b/libc/src/math/f16divf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16divf128 --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16DIVF128_H +#define LLVM_LIBC_SRC_MATH_F16DIVF128_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16divf128(float128 x, float128 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16DIVF128_H diff --git a/libc/src/math/f16divl.h b/libc/src/math/f16divl.h new file mode 100644 index 00000000000000..ad9999135b5880 --- /dev/null +++ b/libc/src/math/f16divl.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16divl -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16DIVL_H +#define LLVM_LIBC_SRC_MATH_F16DIVL_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16divl(long double x, long double y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16DIVL_H diff --git a/libc/src/math/f16fma.h b/libc/src/math/f16fma.h new file mode 100644 index 00000000000000..d9505f88f37af2 --- /dev/null +++ b/libc/src/math/f16fma.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16fma ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16FMA_H +#define LLVM_LIBC_SRC_MATH_F16FMA_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16fma(double x, double y, double z); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16FMA_H diff --git a/libc/src/math/f16fmaf128.h b/libc/src/math/f16fmaf128.h new file mode 100644 index 00000000000000..9203b4d30d2124 --- /dev/null +++ b/libc/src/math/f16fmaf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16fmaf128 --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16FMAF128_H +#define LLVM_LIBC_SRC_MATH_F16FMAF128_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16fmaf128(float128 x, float128 y, float128 z); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16FMAF128_H diff --git a/libc/src/math/f16fmal.h b/libc/src/math/f16fmal.h new file mode 100644 index 00000000000000..6f5dd33aa18f63 --- /dev/null +++ b/libc/src/math/f16fmal.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16fmal -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16FMAL_H +#define LLVM_LIBC_SRC_MATH_F16FMAL_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16fmal(long double x, long double y, long double z); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16FMAL_H diff --git a/libc/src/math/f16sqrt.h b/libc/src/math/f16sqrt.h new file mode 100644 index 00000000000000..f1134ac5ee2b9c --- /dev/null +++ b/libc/src/math/f16sqrt.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16sqrt -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16SQRT_H +#define LLVM_LIBC_SRC_MATH_F16SQRT_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16sqrt(double x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16SQRT_H diff --git a/libc/src/math/f16sqrtf128.h b/libc/src/math/f16sqrtf128.h new file mode 100644 index 00000000000000..61a6ce9ea5a5d7 --- /dev/null +++ b/libc/src/math/f16sqrtf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16sqrtf128 -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16SQRTF128_H +#define LLVM_LIBC_SRC_MATH_F16SQRTF128_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16sqrtf128(float128 x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16SQRTF128_H diff --git a/libc/src/math/f16sqrtl.h b/libc/src/math/f16sqrtl.h new file mode 100644 index 00000000000000..fd3c55fc95f328 --- /dev/null +++ b/libc/src/math/f16sqrtl.h @@ -0,0 +1,20 @@ +//===-- Implementation header for f16sqrtl ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16SQRTL_H +#define LLVM_LIBC_SRC_MATH_F16SQRTL_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 f16sqrtl(long double x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_F16SQRTL_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index a0114aafcad39e..7dd6c4812fa153 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -135,6 +135,23 @@ add_header_library( libc.src.__support.common ) +add_header_library( + range_reduction_double + HDRS + range_reduction_double_common.h + range_reduction_double_fma.h + range_reduction_double_nofma.h + DEPENDS + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.fma + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.common + libc.src.__support.integer_literals +) + add_header_library( sincosf_utils HDRS @@ -146,6 +163,39 @@ add_header_library( libc.src.__support.common ) +add_header_library( + sincos_eval + HDRS + sincos_eval.h + DEPENDS + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.integer_literals +) + +add_entrypoint_object( + cos + SRCS + cos.cpp + HDRS + ../cos.h + DEPENDS + .range_reduction_double + .sincos_eval + libc.hdr.errno_macros + libc.src.errno.errno + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( cosf SRCS @@ -167,6 +217,28 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sin + SRCS + sin.cpp + HDRS + ../sin.h + DEPENDS + .range_reduction_double + .sincos_eval + libc.hdr.errno_macros + libc.src.errno.errno + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( sinf SRCS @@ -189,6 +261,28 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sincos + SRCS + sincos.cpp + HDRS + ../sincos.h + DEPENDS + .range_reduction_double + .sincos_eval + libc.hdr.errno_macros + libc.src.errno.errno + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( sincosf SRCS @@ -3682,6 +3776,71 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + f16div + SRCS + f16div.cpp + HDRS + ../f16div.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.div + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16divf + SRCS + f16divf.cpp + HDRS + ../f16divf.h + DEPENDS + libc.src.__support.macros.properties.types + 
libc.src.__support.FPUtil.generic.div + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16divl + SRCS + f16divl.cpp + HDRS + ../f16divl.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.div + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16divf128 + SRCS + f16divf128.cpp + HDRS + ../f16divf128.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.div + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16fma + SRCS + f16fma.cpp + HDRS + ../f16fma.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.fma + COMPILE_OPTIONS + -O0 -ggdb3 +) + add_entrypoint_object( f16fmaf SRCS @@ -3695,6 +3854,45 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + f16fmal + SRCS + f16fmal.cpp + HDRS + ../f16fmal.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.fma + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16fmaf128 + SRCS + f16fmaf128.cpp + HDRS + ../f16fmaf128.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.fma + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16sqrt + SRCS + f16sqrt.cpp + HDRS + ../f16sqrt.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.sqrt + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( f16sqrtf SRCS @@ -3707,3 +3905,29 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 ) + +add_entrypoint_object( + f16sqrtl + SRCS + f16sqrtl.cpp + HDRS + ../f16sqrtl.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.sqrt + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16sqrtf128 + SRCS + f16sqrtf128.cpp + HDRS + ../f16sqrtf128.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.sqrt + COMPILE_OPTIONS + -O3 +) diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp new file mode 100644 index 00000000000000..a2cfe758fe4cc0 --- /dev/null +++ b/libc/src/math/generic/cos.cpp @@ -0,0 +1,197 @@ +//===-- Double-precision cos function -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cos.h" +#include "hdr/errno_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/common.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/sincos_eval.h" + +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" + +using LIBC_NAMESPACE::fma::FAST_PASS_EXPONENT; +using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI; +using LIBC_NAMESPACE::fma::range_reduction_small; +using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128; + +LIBC_INLINE constexpr bool NO_FMA = false; +#else +#include "range_reduction_double_nofma.h" + +using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT; +using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI; +using LIBC_NAMESPACE::nofma::range_reduction_small; +using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128; + +LIBC_INLINE constexpr bool NO_FMA = true; +#endif // LIBC_TARGET_CPU_HAS_FMA + +// TODO: We might be able to improve the performance of large range reduction of +// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and +// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of +// those lookup table. +#include "range_reduction_double_common.h" + +#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) +#define LIBC_MATH_COS_SKIP_ACCURATE_PASS +#endif + +namespace LIBC_NAMESPACE { + +using DoubleDouble = fputil::DoubleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +LLVM_LIBC_FUNCTION(double, cos, (double x)) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint16_t x_e = xbits.get_biased_exponent(); + + DoubleDouble y; + unsigned k; + generic::LargeRangeReduction range_reduction_large; + + // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { + // |x| < 2^-27 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) + return 1.0; + + // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2. + return fputil::round_result_slightly_down(1.0); + } + + // // Small range reduction. + k = range_reduction_small(x, y); + } else { + // Inf or NaN + if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { + // sin(+-Inf) = NaN + if (xbits.get_mantissa() == 0) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + } + return x + FPBits::quiet_nan().get_val(); + } + + // Large range reduction. 
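// [Illustrative sketch, not part of this patch] Both reduction paths produce
// k ~= round(x * 128/pi) and a double-double y = x - k * pi/128 with
// |y| <= pi/256, so cos(x) is rebuilt from tabulated sin/cos of k*pi/128 via
// the angle-addition formula used further below. A quick numeric check of
// that identity in plain double precision (so only accurate to a few ulps):
#include <cmath>
#include <cstdio>

int main() {
  constexpr double PI = 0x1.921fb54442d18p1; // double nearest to pi
  double x = 100.0;
  double k = std::round(x * (128.0 / PI));
  double y = x - k * (PI / 128.0); // |y| <= pi/256 ~= 0.0123
  double rebuilt = std::cos(y) * std::cos(k * PI / 128.0) -
                   std::sin(y) * std::sin(k * PI / 128.0);
  std::printf("cos(x)=%.17g rebuilt=%.17g\n", std::cos(x), rebuilt);
}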
+ k = range_reduction_large.compute_high_part(x); + y = range_reduction_large.fast(); + } + + DoubleDouble sin_y, cos_y; + + generic::sincos_eval(y, sin_y, cos_y); + + // Look up sin(k * pi/128) and cos(k * pi/128) + // Memory saving versions: + + // Use 128-entry table instead: + // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; + // uint64_t sin_s = static_cast((k + 128) & 128) << (63 - 7); + // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); + // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); + // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; + // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); + // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); + // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); + + // Use 64-entry table instead: + // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + // if (kk & 128) { + // ans.hi = -ans.hi; + // ans.lo = -ans.lo; + // } + // return ans; + // }; + // DoubleDouble sin_k = get_idx_dd(k + 128); + // DoubleDouble cos_k = get_idx_dd(k + 64); + + // Fast look up version, but needs 256-entry table. + // -sin(k * pi/128) = sin((k + 128) * pi/128) + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128) & 255]; + DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; + + // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). + // So k is an integer and -pi / 256 <= y <= pi / 256. + // Then cos(x) = cos((k * pi/128 + y) + // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128) + DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); + DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); + + DoubleDouble rr = fputil::exact_add(cos_k_cos_y.hi, msin_k_sin_y.hi); + rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo; + +#ifdef LIBC_MATH_COS_SKIP_ACCURATE_PASS + return rr.hi + rr.lo; +#else + + // Accurate test and pass for correctly rounded implementation. +#ifdef LIBC_TARGET_CPU_HAS_FMA + constexpr double ERR = 0x1.0p-70; +#else + // TODO: Improve non-FMA fast pass accuracy. + constexpr double ERR = 0x1.0p-66; +#endif // LIBC_TARGET_CPU_HAS_FMA + + double rlp = rr.lo + ERR; + double rlm = rr.lo - ERR; + + double r_upper = rr.hi + rlp; // (rr.lo + ERR); + double r_lower = rr.hi + rlm; // (rr.lo - ERR); + + // Ziv's rounding test. + if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + Float128 u_f128, sin_u, cos_u; + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) + u_f128 = generic::range_reduction_small_f128(x); + else + u_f128 = range_reduction_large.accurate(); + + generic::sincos_eval(u_f128, sin_u, cos_u); + + auto get_sin_k = [](unsigned kk) -> Float128 { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + if (kk & 128) + ans.sign = Sign::NEG; + return ans; + }; + + // -sin(k * pi/128) = sin((k + 128) * pi/128) + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + Float128 msin_k_f128 = get_sin_k(k + 128); + Float128 cos_k_f128 = get_sin_k(k + 64); + + // cos(x) = cos((k * pi/128 + u) + // = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128) + Float128 r = fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u), + fputil::quick_mul(msin_k_f128, sin_u)); + + // TODO: Add assertion if Ziv's accuracy tests fail in debug mode. 
+ // https://github.com/llvm/llvm-project/issues/96452. + + return static_cast(r); +#endif // !LIBC_MATH_COS_SKIP_ACCURATE_PASS +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/f16div.cpp b/libc/src/math/generic/f16div.cpp new file mode 100644 index 00000000000000..2ff0ff78655391 --- /dev/null +++ b/libc/src/math/generic/f16div.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of f16div function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16div.h" +#include "src/__support/FPUtil/generic/div.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, f16div, (double x, double y)) { + return fputil::generic::div(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/f16divf.cpp b/libc/src/math/generic/f16divf.cpp new file mode 100644 index 00000000000000..45874fbac2055a --- /dev/null +++ b/libc/src/math/generic/f16divf.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of f16divf function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16divf.h" +#include "src/__support/FPUtil/generic/div.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, f16divf, (float x, float y)) { + return fputil::generic::div(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/f16divf128.cpp b/libc/src/math/generic/f16divf128.cpp new file mode 100644 index 00000000000000..1d37ad8aa2366c --- /dev/null +++ b/libc/src/math/generic/f16divf128.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of f16divf128 function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16divf128.h" +#include "src/__support/FPUtil/generic/div.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, f16divf128, (float128 x, float128 y)) { + return fputil::generic::div(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/f16divl.cpp b/libc/src/math/generic/f16divl.cpp new file mode 100644 index 00000000000000..3fb9c7891f5d1b --- /dev/null +++ b/libc/src/math/generic/f16divl.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of f16divl function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16divl.h" +#include "src/__support/FPUtil/generic/div.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, f16divl, (long double x, long double y)) { + return fputil::generic::div(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/f16fma.cpp b/libc/src/math/generic/f16fma.cpp new file mode 100644 index 00000000000000..10ee028c069301 --- /dev/null +++ b/libc/src/math/generic/f16fma.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of f16fma function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16fma.h" +#include "src/__support/FPUtil/FMA.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, f16fma, (double x, double y, double z)) { + return fputil::fma(x, y, z); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/f16fmaf128.cpp b/libc/src/math/generic/f16fmaf128.cpp new file mode 100644 index 00000000000000..5b2f801cf21fa5 --- /dev/null +++ b/libc/src/math/generic/f16fmaf128.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of f16fmaf128 function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16fmaf128.h" +#include "src/__support/FPUtil/FMA.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, f16fmaf128, (float128 x, float128 y, float128 z)) { + return fputil::fma(x, y, z); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/f16fmal.cpp b/libc/src/math/generic/f16fmal.cpp new file mode 100644 index 00000000000000..067483629a3362 --- /dev/null +++ b/libc/src/math/generic/f16fmal.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of f16fmal function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/f16fmal.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, f16fmal,
+                   (long double x, long double y, long double z)) {
+  return fputil::fma<float16>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/x86_64/cos.cpp b/libc/src/math/generic/f16sqrt.cpp
similarity index 64%
rename from libc/src/math/x86_64/cos.cpp
rename to libc/src/math/generic/f16sqrt.cpp
index 2cb8db4b7a859d..9d5f081f633152 100644
--- a/libc/src/math/x86_64/cos.cpp
+++ b/libc/src/math/generic/f16sqrt.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the cos function for x86_64 ---------------------===//
+//===-- Implementation of f16sqrt function --------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/math/cos.h"
+#include "src/math/f16sqrt.h"
+#include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/common.h"
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(double, cos, (double x)) {
-  __asm__ __volatile__("fcos" : "+t"(x));
-  return x;
+LLVM_LIBC_FUNCTION(float16, f16sqrt, (double x)) {
+  return fputil::sqrt<float16>(x);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/f16sqrtf128.cpp b/libc/src/math/generic/f16sqrtf128.cpp
new file mode 100644
index 00000000000000..11a1e8252788ee
--- /dev/null
+++ b/libc/src/math/generic/f16sqrtf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of f16sqrtf128 function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/f16sqrtf128.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, f16sqrtf128, (float128 x)) {
+  return fputil::sqrt<float16>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/x86_64/sin.cpp b/libc/src/math/generic/f16sqrtl.cpp
similarity index 64%
rename from libc/src/math/x86_64/sin.cpp
rename to libc/src/math/generic/f16sqrtl.cpp
index 2c7b8aa6e8c836..2aaac9a780f66c 100644
--- a/libc/src/math/x86_64/sin.cpp
+++ b/libc/src/math/generic/f16sqrtl.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the sin function for x86_64 ---------------------===//
+//===-- Implementation of f16sqrtl function -------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/math/sin.h"
+#include "src/math/f16sqrtl.h"
+#include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/common.h"
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(double, sin, (double x)) {
-  __asm__ __volatile__("fsin" : "+t"(x));
-  return x;
+LLVM_LIBC_FUNCTION(float16, f16sqrtl, (long double x)) {
+  return fputil::sqrt<float16>(x);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h
new file mode 100644
index 00000000000000..150118fba0ba06
--- /dev/null
+++ b/libc/src/math/generic/range_reduction_double_common.h
@@ -0,0 +1,230 @@
+//===-- Range reduction for double precision sin/cos/tan -*- C++ --------*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H
+#define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/common.h"
+#include "src/__support/integer_literals.h"
+
+namespace LIBC_NAMESPACE {
+
+namespace generic {
+
+using LIBC_NAMESPACE::fputil::DoubleDouble;
+using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>;
+
+LIBC_INLINE constexpr Float128 PI_OVER_128_F128 = {
+    Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128};
+
+// Note: The look-up table ONE_TWENTY_EIGHT_OVER_PI is selected to be either
+// from the fma:: or the nofma:: namespace.
+
+// For large range |x| >= 2^32, we use the exponent of x to find 3 double-chunks
+// of 128/pi c_hi, c_mid, c_lo such that:
+//   1) ulp(round(x * c_hi, D, RN)) >= 256,
+//   2) If x * c_hi = ph_hi + ph_lo and x * c_mid = pm_hi + pm_lo, then
+//      min(ulp(ph_lo), ulp(pm_hi)) >= 2^-53.
+//   3) ulp(round(x * c_lo, D, RN)) <= 2^-7x.
+// This will allow us to do quick computations as:
+//   (x * 256/pi) ~ x * (c_hi + c_mid + c_lo) (mod 256)
+//                ~ ph_lo + pm_hi + pm_lo + (x * c_lo)
+// Then,
+//   round(x * 128/pi) = round(ph_lo + pm_hi) (mod 256)
+// And the high part of the fractional part of (x * 128/pi) can simply be:
+//   {x * 128/pi}_hi = {ph_lo + pm_hi}.
+// To prevent overflow when x is very large, we simply scale up
+// (c_hi, c_mid, c_lo) by a fixed power of 2 (based on the index) and scale down
+// x by the same amount.
+
+template struct LargeRangeReduction {
+  // Calculate the high part of the range reduction exactly.
+  LIBC_INLINE unsigned compute_high_part(double x) {
+    using FPBits = typename fputil::FPBits<double>;
+    FPBits xbits(x);
+
+    // TODO: The extra exponent gap of 62 below can be reduced a bit for non-FMA
+    // with a more careful analysis, which in turn will reduce the error bound
+    // for non-FMA.
+    int x_e_m62 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 62);
+    idx = static_cast<unsigned>((x_e_m62 >> 4) + 3);
+    // Scale x down by 2^(-16 * (idx - 3)).
+    xbits.set_biased_exponent((x_e_m62 & 15) + FPBits::EXP_BIAS + 62);
+    // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78
+    x_reduced = xbits.get_val();
+    // x * c_hi = ph.hi + ph.lo exactly.
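+    // (For reference: fputil::exact_mult is an error-free transformation; the
+    //  returned DoubleDouble r satisfies r.hi = RN(a * b) and r.hi + r.lo is
+    //  exactly a * b, provided the product neither overflows nor has its low
+    //  part lost to underflow.)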
+ DoubleDouble ph = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); + // x * c_mid = pm.hi + pm.lo exactly. + DoubleDouble pm = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); + // Extract integral parts and fractional parts of (ph.lo + pm.hi). + double kh = fputil::nearest_integer(ph.lo); + double ph_lo_frac = ph.lo - kh; // Exact + double km = fputil::nearest_integer(pm.hi + ph_lo_frac); + double pm_hi_frac = pm.hi - km; // Exact + // x * 128/pi mod 1 ~ y_hi + y_lo + y_hi = ph_lo_frac + pm_hi_frac; // Exact + pm_lo = pm.lo; + return static_cast(static_cast(kh) + + static_cast(km)); + } + + LIBC_INLINE DoubleDouble fast() const { + // y_lo = x * c_lo + pm.lo + double y_lo = fputil::multiply_add(x_reduced, + ONE_TWENTY_EIGHT_OVER_PI[idx][2], pm_lo); + DoubleDouble y = fputil::exact_add(y_hi, y_lo); + + // Digits of pi/128, generated by Sollya with: + // > a = round(pi/128, D, RN); + // > b = round(pi/128 - a, D, RN); + constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, + 0x1.921fb54442d18p-6}; + + // Error bound: with {a} denote the fractional part of a, i.e.: + // {a} = a - round(a) + // Then, + // | {x * 128/pi} - (y_hi + y_lo) | < 2 * ulp(x_reduced * + // * ONE_TWENTY_EIGHT_OVER_PI[idx][2]) + // For FMA: + // | {x * 128/pi} - (y_hi + y_lo) | <= 2 * 2^77 * 2^-103 * 2^-52 + // = 2^-77. + // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-77. + // = 2^-82. + // For non-FMA: + // | {x * 128/pi} - (y_hi + y_lo) | <= 2 * 2^77 * 2^-99 * 2^-52 + // = 2^-73. + // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-73. + // = 2^-78. + return fputil::quick_mult(y, PI_OVER_128_DD); + } + + LIBC_INLINE Float128 accurate() const { + // y_lo = x * c_lo + pm.lo + Float128 y_lo_0(x_reduced * ONE_TWENTY_EIGHT_OVER_PI[idx][3]); + Float128 y_lo_1 = fputil::quick_mul( + Float128(x_reduced), Float128(ONE_TWENTY_EIGHT_OVER_PI[idx][2])); + Float128 y_lo_2(pm_lo); + Float128 y_hi_f128(y_hi); + + Float128 y = fputil::quick_add( + y_hi_f128, + fputil::quick_add(y_lo_2, fputil::quick_add(y_lo_1, y_lo_0))); + + return fputil::quick_mul(y, PI_OVER_128_F128); + } + +private: + // Index of x in the look-up table ONE_TWENTY_EIGHT_OVER_PI. + unsigned idx; + // x scaled down by 2^(-16 *(idx - 3))). + double x_reduced; + // High part of (x * 128/pi) mod 1. + double y_hi; + // Low part of x * ONE_TWENTY_EIGHT_OVER_PI[idx][1]. 
+ double pm_lo; +}; + +LIBC_INLINE Float128 range_reduction_small_f128(double x) { + double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI[3][0]; + double kd = fputil::nearest_integer(prod_hi); + + Float128 mk_f128(-kd); + Float128 x_f128(x); + Float128 p_hi = + fputil::quick_mul(x_f128, Float128(ONE_TWENTY_EIGHT_OVER_PI[3][0])); + Float128 p_mid = + fputil::quick_mul(x_f128, Float128(ONE_TWENTY_EIGHT_OVER_PI[3][1])); + Float128 p_lo = + fputil::quick_mul(x_f128, Float128(ONE_TWENTY_EIGHT_OVER_PI[3][2])); + Float128 s_hi = fputil::quick_add(p_hi, mk_f128); + Float128 s_lo = fputil::quick_add(p_mid, p_lo); + Float128 y = fputil::quick_add(s_hi, s_lo); + + return fputil::quick_mul(y, PI_OVER_128_F128); +} + +LIBC_INLINE constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { + {Sign::POS, 0, 0}, + {Sign::POS, -133, 0xc90a'afbd'1b33'efc9'c539'edcb'fda0'cf2c_u128}, + {Sign::POS, -132, 0xc8fb'2f88'6ec0'9f37'6a17'954b'2b7c'5171_u128}, + {Sign::POS, -131, 0x96a9'0496'70cf'ae65'f775'7409'4d3c'35c4_u128}, + {Sign::POS, -131, 0xc8bd'35e1'4da1'5f0e'c739'6c89'4bbf'7389_u128}, + {Sign::POS, -131, 0xfab2'72b5'4b98'71a2'7047'29ae'56d7'8a37_u128}, + {Sign::POS, -130, 0x9640'8374'7309'd113'000a'89a1'1e07'c1fe_u128}, + {Sign::POS, -130, 0xaf10'a224'59fe'32a6'3fee'f3bb'58b1'f10d_u128}, + {Sign::POS, -130, 0xc7c5'c1e3'4d30'55b2'5cc8'c00e'4fcc'd850_u128}, + {Sign::POS, -130, 0xe05c'1353'f27b'17e5'0ebc'61ad'e6ca'83cd_u128}, + {Sign::POS, -130, 0xf8cf'cbd9'0af8'd57a'4221'dc4b'a772'598d_u128}, + {Sign::POS, -129, 0x888e'9315'8fb3'bb04'9841'56f5'5334'4306_u128}, + {Sign::POS, -129, 0x94a0'3176'acf8'2d45'ae4b'a773'da6b'f754_u128}, + {Sign::POS, -129, 0xa09a'e4a0'bb30'0a19'2f89'5f44'a303'cc0b_u128}, + {Sign::POS, -129, 0xac7c'd3ad'58fe'e7f0'811f'9539'84ef'f83e_u128}, + {Sign::POS, -129, 0xb844'2987'd22c'f576'9cc3'ef36'746d'e3b8_u128}, + {Sign::POS, -129, 0xc3ef'1535'754b'168d'3122'c2a5'9efd'dc37_u128}, + {Sign::POS, -129, 0xcf7b'ca1d'476c'516d'a812'90bd'baad'62e4_u128}, + {Sign::POS, -129, 0xdae8'804f'0ae6'015b'362c'b974'182e'3030_u128}, + {Sign::POS, -129, 0xe633'74c9'8e22'f0b4'2872'ce1b'fc7a'd1cd_u128}, + {Sign::POS, -129, 0xf15a'e9c0'37b1'd8f0'6c48'e9e3'420b'0f1e_u128}, + {Sign::POS, -129, 0xfc5d'26df'c4d5'cfda'27c0'7c91'1290'b8d1_u128}, + {Sign::POS, -128, 0x839c'3cc9'17ff'6cb4'bfd7'9717'f288'0abf_u128}, + {Sign::POS, -128, 0x88f5'9aa0'da59'1421'b892'ca83'61d8'c84c_u128}, + {Sign::POS, -128, 0x8e39'd9cd'7346'4364'bba4'cfec'bff5'4867_u128}, + {Sign::POS, -128, 0x9368'2a66'e896'f544'b178'2191'1e71'c16e_u128}, + {Sign::POS, -128, 0x987f'bfe7'0b81'a708'19ce'c845'ac87'a5c6_u128}, + {Sign::POS, -128, 0x9d7f'd149'0285'c9e3'e25e'3954'9638'ae68_u128}, + {Sign::POS, -128, 0xa267'9928'48ee'b0c0'3b51'67ee'359a'234e_u128}, + {Sign::POS, -128, 0xa736'55df'1f2f'489e'149f'6e75'9934'68a3_u128}, + {Sign::POS, -128, 0xabeb'49a4'6764'fd15'1bec'da80'89c1'a94c_u128}, + {Sign::POS, -128, 0xb085'baa8'e966'f6da'e4ca'd00d'5c94'bcd2_u128}, + {Sign::POS, -128, 0xb504'f333'f9de'6484'597d'89b3'754a'be9f_u128}, + {Sign::POS, -128, 0xb968'41bf'7ffc'b21a'9de1'e3b2'2b8b'f4db_u128}, + {Sign::POS, -128, 0xbdae'f913'557d'76f0'ac85'320f'528d'6d5d_u128}, + {Sign::POS, -128, 0xc1d8'705f'fcbb'6e90'bdf0'715c'b8b2'0bd7_u128}, + {Sign::POS, -128, 0xc5e4'0358'a8ba'05a7'43da'25d9'9267'326b_u128}, + {Sign::POS, -128, 0xc9d1'124c'931f'da7a'8335'241b'e169'3225_u128}, + {Sign::POS, -128, 0xcd9f'023f'9c3a'059e'23af'31db'7179'a4aa_u128}, + {Sign::POS, -128, 0xd14d'3d02'313c'0eed'744f'ea20'e8ab'ef92_u128}, + {Sign::POS, -128, 0xd4db'3148'750d'1819'f630'e8b6'dac8'3e69_u128}, + 
{Sign::POS, -128, 0xd848'52c0'a80f'fcdb'24b9'fe00'6635'74a4_u128}, + {Sign::POS, -128, 0xdb94'1a28'cb71'ec87'2c19'b632'53da'43fc_u128}, + {Sign::POS, -128, 0xdebe'0563'7ca9'4cfb'4b19'aa71'fec3'ae6d_u128}, + {Sign::POS, -128, 0xe1c5'978c'05ed'8691'f4e8'a837'2f8c'5810_u128}, + {Sign::POS, -128, 0xe4aa'5909'a08f'a7b4'1227'85ae'67f5'515d_u128}, + {Sign::POS, -128, 0xe76b'd7a1'e63b'9786'1251'2952'9d48'a92f_u128}, + {Sign::POS, -128, 0xea09'a68a'6e49'cd62'15ad'45b4'a1b5'e823_u128}, + {Sign::POS, -128, 0xec83'5e79'946a'3145'7e61'0231'ac1d'6181_u128}, + {Sign::POS, -128, 0xeed8'9db6'6611'e307'86f8'c20f'b664'b01b_u128}, + {Sign::POS, -128, 0xf109'0827'b437'25fd'6712'7db3'5b28'7316_u128}, + {Sign::POS, -128, 0xf314'4762'4708'8f74'a548'6bdc'455d'56a2_u128}, + {Sign::POS, -128, 0xf4fa'0ab6'316e'd2ec'163c'5c7f'03b7'18c5_u128}, + {Sign::POS, -128, 0xf6ba'073b'424b'19e8'2c79'1f59'cc1f'fc23_u128}, + {Sign::POS, -128, 0xf853'f7dc'9186'b952'c7ad'c6b4'9888'91bb_u128}, + {Sign::POS, -128, 0xf9c7'9d63'272c'4628'4504'ae08'd19b'2980_u128}, + {Sign::POS, -128, 0xfb14'be7f'bae5'8156'2172'a361'fd2a'722f_u128}, + {Sign::POS, -128, 0xfc3b'27d3'8a5d'49ab'2567'78ff'cb5c'1769_u128}, + {Sign::POS, -128, 0xfd3a'abf8'4528'b50b'eae6'bd95'1c1d'abbe_u128}, + {Sign::POS, -128, 0xfe13'2387'0cfe'9a3d'90cd'1d95'9db6'74ef_u128}, + {Sign::POS, -128, 0xfec4'6d1e'8929'2cf0'4139'0efd'c726'e9ef_u128}, + {Sign::POS, -128, 0xff4e'6d68'0c41'd0a9'0f66'8633'f1ab'858a_u128}, + {Sign::POS, -128, 0xffb1'0f1b'cb6b'ef1d'421e'8eda'af59'453e_u128}, + {Sign::POS, -128, 0xffec'4304'2668'65d9'5657'5523'6696'1732_u128}, + {Sign::POS, 0, 1}, +}; + +} // namespace generic + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H diff --git a/libc/src/math/generic/range_reduction_double_fma.h b/libc/src/math/generic/range_reduction_double_fma.h new file mode 100644 index 00000000000000..c136de957d2f05 --- /dev/null +++ b/libc/src/math/generic/range_reduction_double_fma.h @@ -0,0 +1,495 @@ +//===-- Range reduction for double precision sin/cos/tan w/ FMA -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_FMA_H
+#define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_FMA_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+namespace fma {
+
+using LIBC_NAMESPACE::fputil::DoubleDouble;
+
+LIBC_INLINE constexpr int FAST_PASS_EXPONENT = 32;
+
+// Digits of 2^(16*i) / pi, generated by Sollya with:
+// For [2..62]:
+// > for i from 3 to 63 do {
+//     pi_inv = 2^(16*(i - 3)) / pi;
+//     pn = nearestint(pi_inv);
+//     pi_frac = pi_inv - pn;
+//     a = round(pi_frac, D, RN);
+//     b = round(pi_frac - a, D, RN);
+//     c = round(pi_frac - a - b, D, RN);
+//     d = round(pi_frac - a - b - c, D, RN);
+//     print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},");
+//   };
+// For [0..1]:
+// The leading bit of 2^(16*(i - 3)) / pi is very small, so we add 0.25 so that
+// the conditions for the algorithms are still satisfied, and one of those
+// conditions guarantees that ulp(0.25 * x_reduced) >= 2, and will safely be
+// discarded.
+// for i from 0 to 2 do {
+//     pi_frac = 0.25 + 2^(16*(i - 3)) / pi;
+//     a = round(pi_frac, D, RN);
+//     b = round(pi_frac - a, D, RN);
+//     c = round(pi_frac - a - b, D, RN);
+//     d = round(pi_frac - a - b - c, D, RN);
+//     print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},");
+//   };
+// For the fast pass using double-double, we only need 3 parts (a, b, c), but
+// for the accurate pass using Float128, instead of using another table of
+// Float128s, we simply add the fourth part (a, b, c, d), which simplifies the
+// implementation a bit and saves some memory.
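+// (Illustration: entry [3] corresponds to i = 3, i.e. scale 2^0, so its parts
+//  a + b + c + d reconstruct 128/pi itself, with first part
+//  0x1.45f306dc9c883p5 ~ 40.7437; this is the row read by
+//  range_reduction_small below and by range_reduction_small_f128 in
+//  range_reduction_double_common.h for fast-pass inputs.)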
+LIBC_INLINE constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { + {0x1.0000000000014p5, 0x1.7cc1b727220a9p-49, 0x1.3f84eafa3ea6ap-103, + -0x1.11f924eb53362p-157}, + {0x1.0000000145f3p5, 0x1.b727220a94fe1p-49, 0x1.d5f47d4d37703p-104, + 0x1.b6295993c439p-158}, + {0x1.000145f306dcap5, -0x1.bbead603d8a83p-50, 0x1.f534ddc0db629p-106, + 0x1.664f10e4107f9p-160}, + {0x1.45f306dc9c883p5, -0x1.6b01ec5417056p-49, -0x1.6447e493ad4cep-103, + 0x1.e21c820ff28b2p-157}, + {-0x1.f246c6efab581p4, 0x1.3abe8fa9a6eep-53, 0x1.b6c52b3278872p-107, + 0x1.07f9458eaf7afp-164}, + {0x1.391054a7f09d6p4, -0x1.70565911f924fp-53, 0x1.2b3278872084p-107, + -0x1.ae9c5421443aap-162}, + {0x1.529fc2757d1f5p2, 0x1.a6ee06db14acdp-53, -0x1.8778df7c035d4p-107, + 0x1.d5ef5de2b0db9p-161}, + {-0x1.ec54170565912p-1, 0x1.b6c52b3278872p-59, 0x1.07f9458eaf7afp-116, + -0x1.d4f246dc8e2dfp-173}, + {-0x1.505c1596447e5p5, 0x1.b14acc9e21c82p-49, 0x1.fe5163abdebbcp-106, + 0x1.586dc91b8e909p-160}, + {-0x1.596447e493ad5p1, 0x1.93c439041fe51p-54, 0x1.8eaf7aef1586ep-108, + -0x1.b7238b7b645a4p-163}, + {0x1.bb81b6c52b328p5, -0x1.de37df00d74e3p-49, 0x1.7bd778ac36e49p-103, + -0x1.1c5bdb22d1ffap-158}, + {0x1.b6c52b3278872p5, 0x1.07f9458eaf7afp-52, -0x1.d4f246dc8e2dfp-109, + 0x1.374b801924bbbp-164}, + {0x1.2b3278872084p5, -0x1.ae9c5421443aap-50, 0x1.b7246e3a424ddp-106, + 0x1.700324977504fp-161}, + {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db9p-49, 0x1.1b8e909374b8p-104, + 0x1.924bba8274648p-160}, + {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dbp-104, + -0x1.115f62e6de302p-158}, + {-0x1.ae9c5421443aap-2, 0x1.b7246e3a424ddp-58, 0x1.700324977504fp-113, + -0x1.cdbc603c429c7p-167}, + {-0x1.38a84288753c9p5, -0x1.b7238b7b645a4p-51, 0x1.924bba8274648p-112, + 0x1.cfe1deb1cb12ap-166}, + {-0x1.0a21d4f246dc9p3, 0x1.d2126e9700325p-53, -0x1.a22bec5cdbc6p-107, + -0x1.e214e34ed658cp-162}, + {-0x1.d4f246dc8e2dfp3, 0x1.374b801924bbbp-52, -0x1.f62e6de301e21p-106, + -0x1.38d3b5963045ep-160}, + {-0x1.236e4716f6c8bp4, -0x1.1ff9b6d115f63p-50, 0x1.921cfe1deb1cbp-106, + 0x1.29a73ee88235fp-162}, + {0x1.b8e909374b802p4, -0x1.b6d115f62e6dep-50, -0x1.80f10a71a76b3p-105, + 0x1.cfba208d7d4bbp-160}, + {0x1.09374b801924cp4, -0x1.15f62e6de301ep-50, -0x1.0a71a76b2c609p-105, + 0x1.1046bea5d7689p-159}, + {-0x1.68ffcdb688afbp3, -0x1.736f180f10a72p-53, 0x1.62534e7dd1047p-107, + -0x1.0568a25dbd8b3p-161}, + {0x1.924bba8274648p0, 0x1.cfe1deb1cb12ap-54, -0x1.63045df7282b4p-108, + -0x1.44bb7b16638fep-162}, + {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, + 0x1.213a671c09ad1p-160}, + {0x1.3a32439fc3bd6p1, 0x1.cb129a73ee882p-54, 0x1.afa975da24275p-109, + -0x1.8e3f652e8207p-164}, + {-0x1.b78c0788538d4p4, 0x1.29a73ee88235fp-50, 0x1.4baed1213a672p-104, + -0x1.fb29741037d8dp-159}, + {0x1.fc3bd63962535p5, -0x1.822efb9415a29p-51, 0x1.a24274ce38136p-105, + -0x1.741037d8cdc54p-159}, + {-0x1.4e34ed658c117p2, -0x1.f7282b4512edfp-52, 0x1.d338e04d68bfp-107, + -0x1.bec66e29c67cbp-162}, + {0x1.62534e7dd1047p5, -0x1.0568a25dbd8b3p-49, -0x1.c7eca5d040df6p-105, + -0x1.9b8a719f2b318p-160}, + {-0x1.63045df7282b4p4, -0x1.44bb7b16638fep-50, 0x1.ad17df904e647p-104, + 0x1.639835339f49dp-158}, + {0x1.d1046bea5d769p5, -0x1.bd8b31c7eca5dp-49, -0x1.037d8cdc538dp-107, + 0x1.a99cfa4e422fcp-161}, + {0x1.afa975da24275p3, -0x1.8e3f652e8207p-52, 0x1.3991d63983534p-106, + -0x1.82d8dee81d108p-160}, + {-0x1.a28976f62cc72p5, 0x1.35a2fbf209cc9p-53, -0x1.4e33e566305b2p-109, + 0x1.08bf177bf2507p-163}, + {-0x1.76f62cc71fb29p5, -0x1.d040df633714ep-49, -0x1.9f2b3182d8defp-104, + 0x1.f8bbdf9283b2p-158}, + 
{0x1.d338e04d68bfp5, -0x1.bec66e29c67cbp-50, 0x1.9cfa4e422fc5ep-105, + -0x1.036be27003b4p-161}, + {0x1.c09ad17df904ep4, 0x1.91d639835339fp-50, 0x1.272117e2ef7e5p-104, + -0x1.7c4e007680022p-158}, + {0x1.68befc827323bp5, -0x1.c67cacc60b638p-50, 0x1.17e2ef7e4a0ecp-104, + 0x1.ff897ffde0598p-158}, + {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ff1p-103, + 0x1.7ffde05980fefp-158}, + {-0x1.8cdc538cf9599p5, 0x1.f49c845f8bbep-50, -0x1.b5f13801da001p-104, + 0x1.e05980fef2f12p-158}, + {-0x1.4e33e566305b2p3, 0x1.08bf177bf2507p-51, 0x1.8ffc4bffef02dp-105, + -0x1.fc04343b9d298p-160}, + {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f13802p-52, 0x1.2fffbc0b301fep-107, + -0x1.a1dce94beb25cp-163}, + {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086877p-109, + -0x1.d297d64b824b2p-164}, + {0x1.3908bf177bf25p5, 0x1.d8ffc4bffef03p-53, -0x1.9fc04343b9d29p-108, + -0x1.f592e092c9813p-162}, + {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f593p-110, + 0x1.f6d367ecf27cbp-166}, + {-0x1.081b5f13801dap4, -0x1.0fd33f8086877p-61, -0x1.d297d64b824b2p-116, + -0x1.8130d834f648bp-170}, + {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee7p-50, -0x1.297d64b824b26p-104, + -0x1.30d834f648b0cp-162}, + {-0x1.c00ed00043f4dp5, 0x1.fde5e2316b415p-55, -0x1.2e092c98130d8p-110, + -0x1.a7b24585ce04dp-165}, + {0x1.2fffbc0b301fep5, -0x1.a1dce94beb25cp-51, -0x1.25930261b069fp-107, + 0x1.b74f463f669e6p-162}, + {-0x1.0fd33f8086877p3, -0x1.d297d64b824b2p-52, -0x1.8130d834f648bp-106, + -0x1.738132c3402bap-163}, + {-0x1.9fc04343b9d29p4, -0x1.f592e092c9813p-50, -0x1.b069ec9161738p-107, + -0x1.32c3402ba515bp-163}, + {-0x1.0d0ee74a5f593p2, 0x1.f6d367ecf27cbp-54, 0x1.36e9e8c7ecd3dp-111, + -0x1.00ae9456c229cp-165}, + {-0x1.dce94beb25c12p5, -0x1.64c0986c1a7b2p-49, -0x1.161738132c34p-103, + -0x1.5d28ad8453814p-158}, + {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b6p-104, + -0x1.5b08a7028341dp-159}, + {0x1.b47db4d9fb3cap4, -0x1.a7b24585ce04dp-53, 0x1.3cbfd45aea4f7p-107, + 0x1.63f5f2f8bd9e8p-161}, + {-0x1.25930261b069fp5, 0x1.b74f463f669e6p-50, -0x1.5d28ad8453814p-110, + -0x1.a0e84c2f8c608p-166}, + {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bap-51, -0x1.456c229c0a0dp-105, + -0x1.d0985f18c10ebp-159}, + {-0x1.b069ec9161738p5, -0x1.32c3402ba515bp-51, -0x1.14e050683a131p-108, + 0x1.0739f78a5292fp-162}, + {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41dp-104, + -0x1.821d6b5b4565p-160}, + {-0x1.61738132c3403p5, 0x1.16ba93dd63f5fp-49, 0x1.7c5ecf41ce7dep-104, + 0x1.4a525d4d7f6bfp-159}, + {0x1.fb34f2ff516bbp3, -0x1.b08a7028341d1p-51, 0x1.9e839cfbc5295p-105, + -0x1.a2b2809409dc1p-159}, + {0x1.3cbfd45aea4f7p5, 0x1.63f5f2f8bd9e8p-49, 0x1.ce7de294a4baap-104, + -0x1.404a04ee072a3p-158}, + {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, + -0x1.3b81ca8bdea7fp-164}, + {-0x1.15b08a7028342p5, 0x1.7b3d0739f78a5p-50, 0x1.497535fdafd89p-105, + -0x1.ca8bdea7f33eep-164}, +}; + +// Lookup table for sin(k * pi / 128) with k = 0, ..., 255. 
+// Table is generated with Sollya as follow: +// > display = hexadecimal; +// > for k from 0 to 255 do { +// a = D(sin(k * pi/128)); }; +// b = D(sin(k * pi/128) - a); +// print("{", b, ",", a, "},"); +// }; +LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { + {0, 0}, + {-0x1.b1d63091a013p-64, 0x1.92155f7a3667ep-6}, + {-0x1.912bd0d569a9p-61, 0x1.91f65f10dd814p-5}, + {-0x1.9a088a8bf6b2cp-59, 0x1.2d52092ce19f6p-4}, + {-0x1.e2718d26ed688p-60, 0x1.917a6bc29b42cp-4}, + {0x1.a2704729ae56dp-59, 0x1.f564e56a9730ep-4}, + {0x1.13000a89a11ep-58, 0x1.2c8106e8e613ap-3}, + {0x1.531ff779ddac6p-57, 0x1.5e214448b3fc6p-3}, + {-0x1.26d19b9ff8d82p-57, 0x1.8f8b83c69a60bp-3}, + {-0x1.af1439e521935p-62, 0x1.c0b826a7e4f63p-3}, + {-0x1.42deef11da2c4p-57, 0x1.f19f97b215f1bp-3}, + {0x1.824c20ab7aa9ap-56, 0x1.111d262b1f677p-2}, + {-0x1.5d28da2c4612dp-56, 0x1.294062ed59f06p-2}, + {0x1.0c97c4afa2518p-56, 0x1.4135c94176601p-2}, + {-0x1.efdc0d58cf62p-62, 0x1.58f9a75ab1fddp-2}, + {-0x1.44b19e0864c5dp-56, 0x1.7088530fa459fp-2}, + {-0x1.72cedd3d5a61p-57, 0x1.87de2a6aea963p-2}, + {0x1.6da81290bdbabp-57, 0x1.9ef7943a8ed8ap-2}, + {0x1.5b362cb974183p-57, 0x1.b5d1009e15ccp-2}, + {0x1.6850e59c37f8fp-58, 0x1.cc66e9931c45ep-2}, + {0x1.e0d891d3c6841p-58, 0x1.e2b5d3806f63bp-2}, + {-0x1.2ec1fc1b776b8p-60, 0x1.f8ba4dbf89abap-2}, + {-0x1.a5a014347406cp-55, 0x1.073879922ffeep-1}, + {-0x1.ef23b69abe4f1p-55, 0x1.11eb3541b4b23p-1}, + {0x1.b25dd267f66p-55, 0x1.1c73b39ae68c8p-1}, + {-0x1.5da743ef3770cp-55, 0x1.26d054cdd12dfp-1}, + {-0x1.efcc626f74a6fp-57, 0x1.30ff7fce17035p-1}, + {0x1.e3e25e3954964p-56, 0x1.3affa292050b9p-1}, + {0x1.8076a2cfdc6b3p-57, 0x1.44cf325091dd6p-1}, + {0x1.3c293edceb327p-57, 0x1.4e6cabbe3e5e9p-1}, + {-0x1.75720992bfbb2p-55, 0x1.57d69348cecap-1}, + {-0x1.251b352ff2a37p-56, 0x1.610b7551d2cdfp-1}, + {-0x1.bdd3413b26456p-55, 0x1.6a09e667f3bcdp-1}, + {0x1.0d4ef0f1d915cp-55, 0x1.72d0837efff96p-1}, + {-0x1.0f537acdf0ad7p-56, 0x1.7b5df226aafafp-1}, + {-0x1.6f420f8ea3475p-56, 0x1.83b0e0bff976ep-1}, + {-0x1.2c5e12ed1336dp-55, 0x1.8bc806b151741p-1}, + {0x1.3d419a920df0bp-55, 0x1.93a22499263fbp-1}, + {-0x1.30ee286712474p-55, 0x1.9b3e047f38741p-1}, + {-0x1.128bb015df175p-56, 0x1.a29a7a0462782p-1}, + {0x1.9f630e8b6dac8p-60, 0x1.a9b66290ea1a3p-1}, + {-0x1.926da300ffccep-55, 0x1.b090a581502p-1}, + {-0x1.bc69f324e6d61p-55, 0x1.b728345196e3ep-1}, + {-0x1.825a732ac700ap-55, 0x1.bd7c0ac6f952ap-1}, + {-0x1.6e0b1757c8d07p-56, 0x1.c38b2f180bdb1p-1}, + {-0x1.2fb761e946603p-58, 0x1.c954b213411f5p-1}, + {-0x1.e7b6bb5ab58aep-58, 0x1.ced7af43cc773p-1}, + {-0x1.4ef5295d25af2p-55, 0x1.d4134d14dc93ap-1}, + {0x1.457e610231ac2p-56, 0x1.d906bcf328d46p-1}, + {0x1.83c37c6107db3p-55, 0x1.ddb13b6ccc23cp-1}, + {-0x1.014c76c126527p-55, 0x1.e212104f686e5p-1}, + {-0x1.16b56f2847754p-57, 0x1.e6288ec48e112p-1}, + {0x1.760b1e2e3f81ep-55, 0x1.e9f4156c62ddap-1}, + {0x1.e82c791f59cc2p-56, 0x1.ed740e7684963p-1}, + {0x1.52c7adc6b4989p-56, 0x1.f0a7efb9230d7p-1}, + {-0x1.d7bafb51f72e6p-56, 0x1.f38f3ac64e589p-1}, + {0x1.562172a361fd3p-56, 0x1.f6297cff75cbp-1}, + {0x1.ab256778ffcb6p-56, 0x1.f8764fa714ba9p-1}, + {-0x1.7a0a8ca13571fp-55, 0x1.fa7557f08a517p-1}, + {0x1.1ec8668ecaceep-55, 0x1.fc26470e19fd3p-1}, + {-0x1.87df6378811c7p-55, 0x1.fd88da3d12526p-1}, + {0x1.521ecd0c67e35p-57, 0x1.fe9cdad01883ap-1}, + {-0x1.c57bc2e24aa15p-57, 0x1.ff621e3796d7ep-1}, + {-0x1.1354d4556e4cbp-55, 0x1.ffd886084cd0dp-1}, + {0, 1}, + {-0x1.1354d4556e4cbp-55, 0x1.ffd886084cd0dp-1}, + {-0x1.c57bc2e24aa15p-57, 0x1.ff621e3796d7ep-1}, + {0x1.521ecd0c67e35p-57, 0x1.fe9cdad01883ap-1}, 
+ {-0x1.87df6378811c7p-55, 0x1.fd88da3d12526p-1}, + {0x1.1ec8668ecaceep-55, 0x1.fc26470e19fd3p-1}, + {-0x1.7a0a8ca13571fp-55, 0x1.fa7557f08a517p-1}, + {0x1.ab256778ffcb6p-56, 0x1.f8764fa714ba9p-1}, + {0x1.562172a361fd3p-56, 0x1.f6297cff75cbp-1}, + {-0x1.d7bafb51f72e6p-56, 0x1.f38f3ac64e589p-1}, + {0x1.52c7adc6b4989p-56, 0x1.f0a7efb9230d7p-1}, + {0x1.e82c791f59cc2p-56, 0x1.ed740e7684963p-1}, + {0x1.760b1e2e3f81ep-55, 0x1.e9f4156c62ddap-1}, + {-0x1.16b56f2847754p-57, 0x1.e6288ec48e112p-1}, + {-0x1.014c76c126527p-55, 0x1.e212104f686e5p-1}, + {0x1.83c37c6107db3p-55, 0x1.ddb13b6ccc23cp-1}, + {0x1.457e610231ac2p-56, 0x1.d906bcf328d46p-1}, + {-0x1.4ef5295d25af2p-55, 0x1.d4134d14dc93ap-1}, + {-0x1.e7b6bb5ab58aep-58, 0x1.ced7af43cc773p-1}, + {-0x1.2fb761e946603p-58, 0x1.c954b213411f5p-1}, + {-0x1.6e0b1757c8d07p-56, 0x1.c38b2f180bdb1p-1}, + {-0x1.825a732ac700ap-55, 0x1.bd7c0ac6f952ap-1}, + {-0x1.bc69f324e6d61p-55, 0x1.b728345196e3ep-1}, + {-0x1.926da300ffccep-55, 0x1.b090a581502p-1}, + {0x1.9f630e8b6dac8p-60, 0x1.a9b66290ea1a3p-1}, + {-0x1.128bb015df175p-56, 0x1.a29a7a0462782p-1}, + {-0x1.30ee286712474p-55, 0x1.9b3e047f38741p-1}, + {0x1.3d419a920df0bp-55, 0x1.93a22499263fbp-1}, + {-0x1.2c5e12ed1336dp-55, 0x1.8bc806b151741p-1}, + {-0x1.6f420f8ea3475p-56, 0x1.83b0e0bff976ep-1}, + {-0x1.0f537acdf0ad7p-56, 0x1.7b5df226aafafp-1}, + {0x1.0d4ef0f1d915cp-55, 0x1.72d0837efff96p-1}, + {-0x1.bdd3413b26456p-55, 0x1.6a09e667f3bcdp-1}, + {-0x1.251b352ff2a37p-56, 0x1.610b7551d2cdfp-1}, + {-0x1.75720992bfbb2p-55, 0x1.57d69348cecap-1}, + {0x1.3c293edceb327p-57, 0x1.4e6cabbe3e5e9p-1}, + {0x1.8076a2cfdc6b3p-57, 0x1.44cf325091dd6p-1}, + {0x1.e3e25e3954964p-56, 0x1.3affa292050b9p-1}, + {-0x1.efcc626f74a6fp-57, 0x1.30ff7fce17035p-1}, + {-0x1.5da743ef3770cp-55, 0x1.26d054cdd12dfp-1}, + {0x1.b25dd267f66p-55, 0x1.1c73b39ae68c8p-1}, + {-0x1.ef23b69abe4f1p-55, 0x1.11eb3541b4b23p-1}, + {-0x1.a5a014347406cp-55, 0x1.073879922ffeep-1}, + {-0x1.2ec1fc1b776b8p-60, 0x1.f8ba4dbf89abap-2}, + {0x1.e0d891d3c6841p-58, 0x1.e2b5d3806f63bp-2}, + {0x1.6850e59c37f8fp-58, 0x1.cc66e9931c45ep-2}, + {0x1.5b362cb974183p-57, 0x1.b5d1009e15ccp-2}, + {0x1.6da81290bdbabp-57, 0x1.9ef7943a8ed8ap-2}, + {-0x1.72cedd3d5a61p-57, 0x1.87de2a6aea963p-2}, + {-0x1.44b19e0864c5dp-56, 0x1.7088530fa459fp-2}, + {-0x1.efdc0d58cf62p-62, 0x1.58f9a75ab1fddp-2}, + {0x1.0c97c4afa2518p-56, 0x1.4135c94176601p-2}, + {-0x1.5d28da2c4612dp-56, 0x1.294062ed59f06p-2}, + {0x1.824c20ab7aa9ap-56, 0x1.111d262b1f677p-2}, + {-0x1.42deef11da2c4p-57, 0x1.f19f97b215f1bp-3}, + {-0x1.af1439e521935p-62, 0x1.c0b826a7e4f63p-3}, + {-0x1.26d19b9ff8d82p-57, 0x1.8f8b83c69a60bp-3}, + {0x1.531ff779ddac6p-57, 0x1.5e214448b3fc6p-3}, + {0x1.13000a89a11ep-58, 0x1.2c8106e8e613ap-3}, + {0x1.a2704729ae56dp-59, 0x1.f564e56a9730ep-4}, + {-0x1.e2718d26ed688p-60, 0x1.917a6bc29b42cp-4}, + {-0x1.9a088a8bf6b2cp-59, 0x1.2d52092ce19f6p-4}, + {-0x1.912bd0d569a9p-61, 0x1.91f65f10dd814p-5}, + {-0x1.b1d63091a013p-64, 0x1.92155f7a3667ep-6}, + {0, 0}, + {0x1.b1d63091a013p-64, -0x1.92155f7a3667ep-6}, + {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, + {0x1.9a088a8bf6b2cp-59, -0x1.2d52092ce19f6p-4}, + {0x1.e2718d26ed688p-60, -0x1.917a6bc29b42cp-4}, + {-0x1.a2704729ae56dp-59, -0x1.f564e56a9730ep-4}, + {-0x1.13000a89a11ep-58, -0x1.2c8106e8e613ap-3}, + {-0x1.531ff779ddac6p-57, -0x1.5e214448b3fc6p-3}, + {0x1.26d19b9ff8d82p-57, -0x1.8f8b83c69a60bp-3}, + {0x1.af1439e521935p-62, -0x1.c0b826a7e4f63p-3}, + {0x1.42deef11da2c4p-57, -0x1.f19f97b215f1bp-3}, + {-0x1.824c20ab7aa9ap-56, -0x1.111d262b1f677p-2}, + {0x1.5d28da2c4612dp-56, 
-0x1.294062ed59f06p-2}, + {-0x1.0c97c4afa2518p-56, -0x1.4135c94176601p-2}, + {0x1.efdc0d58cf62p-62, -0x1.58f9a75ab1fddp-2}, + {0x1.44b19e0864c5dp-56, -0x1.7088530fa459fp-2}, + {0x1.72cedd3d5a61p-57, -0x1.87de2a6aea963p-2}, + {-0x1.6da81290bdbabp-57, -0x1.9ef7943a8ed8ap-2}, + {-0x1.5b362cb974183p-57, -0x1.b5d1009e15ccp-2}, + {-0x1.6850e59c37f8fp-58, -0x1.cc66e9931c45ep-2}, + {-0x1.e0d891d3c6841p-58, -0x1.e2b5d3806f63bp-2}, + {0x1.2ec1fc1b776b8p-60, -0x1.f8ba4dbf89abap-2}, + {0x1.a5a014347406cp-55, -0x1.073879922ffeep-1}, + {0x1.ef23b69abe4f1p-55, -0x1.11eb3541b4b23p-1}, + {-0x1.b25dd267f66p-55, -0x1.1c73b39ae68c8p-1}, + {0x1.5da743ef3770cp-55, -0x1.26d054cdd12dfp-1}, + {0x1.efcc626f74a6fp-57, -0x1.30ff7fce17035p-1}, + {-0x1.e3e25e3954964p-56, -0x1.3affa292050b9p-1}, + {-0x1.8076a2cfdc6b3p-57, -0x1.44cf325091dd6p-1}, + {-0x1.3c293edceb327p-57, -0x1.4e6cabbe3e5e9p-1}, + {0x1.75720992bfbb2p-55, -0x1.57d69348cecap-1}, + {0x1.251b352ff2a37p-56, -0x1.610b7551d2cdfp-1}, + {0x1.bdd3413b26456p-55, -0x1.6a09e667f3bcdp-1}, + {-0x1.0d4ef0f1d915cp-55, -0x1.72d0837efff96p-1}, + {0x1.0f537acdf0ad7p-56, -0x1.7b5df226aafafp-1}, + {0x1.6f420f8ea3475p-56, -0x1.83b0e0bff976ep-1}, + {0x1.2c5e12ed1336dp-55, -0x1.8bc806b151741p-1}, + {-0x1.3d419a920df0bp-55, -0x1.93a22499263fbp-1}, + {0x1.30ee286712474p-55, -0x1.9b3e047f38741p-1}, + {0x1.128bb015df175p-56, -0x1.a29a7a0462782p-1}, + {-0x1.9f630e8b6dac8p-60, -0x1.a9b66290ea1a3p-1}, + {0x1.926da300ffccep-55, -0x1.b090a581502p-1}, + {0x1.bc69f324e6d61p-55, -0x1.b728345196e3ep-1}, + {0x1.825a732ac700ap-55, -0x1.bd7c0ac6f952ap-1}, + {0x1.6e0b1757c8d07p-56, -0x1.c38b2f180bdb1p-1}, + {0x1.2fb761e946603p-58, -0x1.c954b213411f5p-1}, + {0x1.e7b6bb5ab58aep-58, -0x1.ced7af43cc773p-1}, + {0x1.4ef5295d25af2p-55, -0x1.d4134d14dc93ap-1}, + {-0x1.457e610231ac2p-56, -0x1.d906bcf328d46p-1}, + {-0x1.83c37c6107db3p-55, -0x1.ddb13b6ccc23cp-1}, + {0x1.014c76c126527p-55, -0x1.e212104f686e5p-1}, + {0x1.16b56f2847754p-57, -0x1.e6288ec48e112p-1}, + {-0x1.760b1e2e3f81ep-55, -0x1.e9f4156c62ddap-1}, + {-0x1.e82c791f59cc2p-56, -0x1.ed740e7684963p-1}, + {-0x1.52c7adc6b4989p-56, -0x1.f0a7efb9230d7p-1}, + {0x1.d7bafb51f72e6p-56, -0x1.f38f3ac64e589p-1}, + {-0x1.562172a361fd3p-56, -0x1.f6297cff75cbp-1}, + {-0x1.ab256778ffcb6p-56, -0x1.f8764fa714ba9p-1}, + {0x1.7a0a8ca13571fp-55, -0x1.fa7557f08a517p-1}, + {-0x1.1ec8668ecaceep-55, -0x1.fc26470e19fd3p-1}, + {0x1.87df6378811c7p-55, -0x1.fd88da3d12526p-1}, + {-0x1.521ecd0c67e35p-57, -0x1.fe9cdad01883ap-1}, + {0x1.c57bc2e24aa15p-57, -0x1.ff621e3796d7ep-1}, + {0x1.1354d4556e4cbp-55, -0x1.ffd886084cd0dp-1}, + {0, -1}, + {0x1.1354d4556e4cbp-55, -0x1.ffd886084cd0dp-1}, + {0x1.c57bc2e24aa15p-57, -0x1.ff621e3796d7ep-1}, + {-0x1.521ecd0c67e35p-57, -0x1.fe9cdad01883ap-1}, + {0x1.87df6378811c7p-55, -0x1.fd88da3d12526p-1}, + {-0x1.1ec8668ecaceep-55, -0x1.fc26470e19fd3p-1}, + {0x1.7a0a8ca13571fp-55, -0x1.fa7557f08a517p-1}, + {-0x1.ab256778ffcb6p-56, -0x1.f8764fa714ba9p-1}, + {-0x1.562172a361fd3p-56, -0x1.f6297cff75cbp-1}, + {0x1.d7bafb51f72e6p-56, -0x1.f38f3ac64e589p-1}, + {-0x1.52c7adc6b4989p-56, -0x1.f0a7efb9230d7p-1}, + {-0x1.e82c791f59cc2p-56, -0x1.ed740e7684963p-1}, + {-0x1.760b1e2e3f81ep-55, -0x1.e9f4156c62ddap-1}, + {0x1.16b56f2847754p-57, -0x1.e6288ec48e112p-1}, + {0x1.014c76c126527p-55, -0x1.e212104f686e5p-1}, + {-0x1.83c37c6107db3p-55, -0x1.ddb13b6ccc23cp-1}, + {-0x1.457e610231ac2p-56, -0x1.d906bcf328d46p-1}, + {0x1.4ef5295d25af2p-55, -0x1.d4134d14dc93ap-1}, + {0x1.e7b6bb5ab58aep-58, -0x1.ced7af43cc773p-1}, + {0x1.2fb761e946603p-58, -0x1.c954b213411f5p-1}, 
+ {0x1.6e0b1757c8d07p-56, -0x1.c38b2f180bdb1p-1}, + {0x1.825a732ac700ap-55, -0x1.bd7c0ac6f952ap-1}, + {0x1.bc69f324e6d61p-55, -0x1.b728345196e3ep-1}, + {0x1.926da300ffccep-55, -0x1.b090a581502p-1}, + {-0x1.9f630e8b6dac8p-60, -0x1.a9b66290ea1a3p-1}, + {0x1.128bb015df175p-56, -0x1.a29a7a0462782p-1}, + {0x1.30ee286712474p-55, -0x1.9b3e047f38741p-1}, + {-0x1.3d419a920df0bp-55, -0x1.93a22499263fbp-1}, + {0x1.2c5e12ed1336dp-55, -0x1.8bc806b151741p-1}, + {0x1.6f420f8ea3475p-56, -0x1.83b0e0bff976ep-1}, + {0x1.0f537acdf0ad7p-56, -0x1.7b5df226aafafp-1}, + {-0x1.0d4ef0f1d915cp-55, -0x1.72d0837efff96p-1}, + {0x1.bdd3413b26456p-55, -0x1.6a09e667f3bcdp-1}, + {0x1.251b352ff2a37p-56, -0x1.610b7551d2cdfp-1}, + {0x1.75720992bfbb2p-55, -0x1.57d69348cecap-1}, + {-0x1.3c293edceb327p-57, -0x1.4e6cabbe3e5e9p-1}, + {-0x1.8076a2cfdc6b3p-57, -0x1.44cf325091dd6p-1}, + {-0x1.e3e25e3954964p-56, -0x1.3affa292050b9p-1}, + {0x1.efcc626f74a6fp-57, -0x1.30ff7fce17035p-1}, + {0x1.5da743ef3770cp-55, -0x1.26d054cdd12dfp-1}, + {-0x1.b25dd267f66p-55, -0x1.1c73b39ae68c8p-1}, + {0x1.ef23b69abe4f1p-55, -0x1.11eb3541b4b23p-1}, + {0x1.a5a014347406cp-55, -0x1.073879922ffeep-1}, + {0x1.2ec1fc1b776b8p-60, -0x1.f8ba4dbf89abap-2}, + {-0x1.e0d891d3c6841p-58, -0x1.e2b5d3806f63bp-2}, + {-0x1.6850e59c37f8fp-58, -0x1.cc66e9931c45ep-2}, + {-0x1.5b362cb974183p-57, -0x1.b5d1009e15ccp-2}, + {-0x1.6da81290bdbabp-57, -0x1.9ef7943a8ed8ap-2}, + {0x1.72cedd3d5a61p-57, -0x1.87de2a6aea963p-2}, + {0x1.44b19e0864c5dp-56, -0x1.7088530fa459fp-2}, + {0x1.efdc0d58cf62p-62, -0x1.58f9a75ab1fddp-2}, + {-0x1.0c97c4afa2518p-56, -0x1.4135c94176601p-2}, + {0x1.5d28da2c4612dp-56, -0x1.294062ed59f06p-2}, + {-0x1.824c20ab7aa9ap-56, -0x1.111d262b1f677p-2}, + {0x1.42deef11da2c4p-57, -0x1.f19f97b215f1bp-3}, + {0x1.af1439e521935p-62, -0x1.c0b826a7e4f63p-3}, + {0x1.26d19b9ff8d82p-57, -0x1.8f8b83c69a60bp-3}, + {-0x1.531ff779ddac6p-57, -0x1.5e214448b3fc6p-3}, + {-0x1.13000a89a11ep-58, -0x1.2c8106e8e613ap-3}, + {-0x1.a2704729ae56dp-59, -0x1.f564e56a9730ep-4}, + {0x1.e2718d26ed688p-60, -0x1.917a6bc29b42cp-4}, + {0x1.9a088a8bf6b2cp-59, -0x1.2d52092ce19f6p-4}, + {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, + {0x1.b1d63091a013p-64, -0x1.92155f7a3667ep-6}, +}; + +// For |x| < 2^-32, return k and u such that: +// k = round(x * 128/pi) +// x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo +LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { + // Digits of pi/128, generated by Sollya with: + // > a = round(pi/128, D, RN); + // > b = round(pi/128 - a, D, RN); + constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, + 0x1.921fb54442d18p-6}; + + double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI[3][0]; + double kd = fputil::nearest_integer(prod_hi); + + // Let y = x - k * (pi/128) + // Then |y| < pi / 256 + // With extra rounding errors, we can bound |y| < 2^-6. + double y_hi = fputil::multiply_add(kd, -PI_OVER_128_DD.hi, x); // Exact + // u_hi + u_lo ~ (y_hi + kd*(-PI_OVER_128_DD[1])) + // and |u_lo| < 2* ulp(u_hi) + // The upper bound 2^-6 is over-estimated, we should still have: + // |u_hi + u_lo| < 2^-6. 
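+  // (Quick sanity check, not part of the error analysis: for x = 100.0 we get
+  //  kd = round(100 * 128/pi) = 4074, and x - kd * pi/128 ~ 8.996e-3, which is
+  //  indeed below pi/256 ~ 1.227e-2.)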
+  u.hi = fputil::multiply_add(kd, -PI_OVER_128_DD.lo, y_hi);
+  u.lo = y_hi - u.hi; // Exact
+  u.lo = fputil::multiply_add(kd, -PI_OVER_128_DD.lo, u.lo);
+  // Error bound:
+  // For |x| < 2^32:
+  //   |x * high part of 128/pi| < 2^32 * 2^6 = 2^38
+  //   So |k| = |round(x * high part of 128/pi)| < 2^38
+  // And hence,
+  //   |(x mod pi/128) - (u.hi + u.lo)| <= ulp(2 * kd * PI_OVER_128_DD.lo)
+  //                                    < 2 * 2^38 * 2^-59 * 2^-52
+  //                                    = 2^-72
+  // Note: if we limit the input exponent to the same as in the non-FMA version,
+  // i.e., |x| < 2^23, then the output errors can be bounded by 2^-81, similar
+  // to the large range reduction bound.
+  return static_cast<unsigned>(static_cast<int64_t>(kd));
+}
+
+} // namespace fma
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_FMA_H
diff --git a/libc/src/math/generic/range_reduction_double_nofma.h b/libc/src/math/generic/range_reduction_double_nofma.h
new file mode 100644
index 00000000000000..b9d34d6d9f1a8f
--- /dev/null
+++ b/libc/src/math/generic/range_reduction_double_nofma.h
@@ -0,0 +1,493 @@
+//===-- Range reduction for double precision sin/cos/tan --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_NOFMA_H
+#define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_NOFMA_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+namespace nofma {
+
+using fputil::DoubleDouble;
+
+LIBC_INLINE constexpr int FAST_PASS_EXPONENT = 23;
+
+// Digits of 2^(16*i) / pi, generated by Sollya with:
+// For [2..62]:
+// > for i from 3 to 63 do {
+//     pi_inv = 2^(16*(i - 3)) / pi;
+//     pn = nearestint(pi_inv);
+//     pi_frac = pi_inv - pn;
+//     a = round(pi_frac, 51, RN);
+//     b = round(pi_frac - a, 51, RN);
+//     c = round(pi_frac - a - b, D, RN);
+//     d = round(pi_frac - a - b - c, D, RN);
+//     print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},");
+//   };
+// For [0..1]:
+// The leading bit of 2^(16*(i - 3)) / pi is very small, so we add 0.25 so that
+// the conditions for the algorithms are still satisfied, and one of those
+// conditions guarantees that ulp(0.25 * x_reduced) >= 2, and will safely be
+// discarded.
+// for i from 0 to 2 do {
+//     pi_frac = 0.25 + 2^(16*(i - 3)) / pi;
+//     a = round(pi_frac, 51, RN);
+//     b = round(pi_frac - a, 51, RN);
+//     c = round(pi_frac - a - b, D, RN);
+//     d = round(pi_frac - a - b - c, D, RN);
+//     print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},");
+//   };
+// For the fast pass using double-double, we only need 3 parts (a, b, c), but
+// for the accurate pass using Float128, instead of using another table of
+// Float128s, we simply add the fourth part (a, b, c, d), which simplifies the
+// implementation a bit and saves some memory.
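+// (Note: relative to the fma:: table above, the first two parts of every entry
+//  here are rounded to 51 bits rather than full double precision, as shown in
+//  the Sollya commands above, and FAST_PASS_EXPONENT is 23 instead of 32,
+//  presumably so that the double-double products formed from these entries
+//  stay error-free without an FMA instruction.)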
+LIBC_INLINE constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { + {0x1.0000000000014p5, 0x1.7cc1b727220a8p-49, 0x1.4fe13abe8fa9ap-101, + 0x1.bb81b6c52b328p-155}, + {0x1.0000000145f3p5, 0x1.b727220a94fep-49, 0x1.3abe8fa9a6eep-101, + 0x1.b6c52b3278872p-155}, + {0x1.000145f306dc8p5, 0x1.c882a53f84ebp-47, -0x1.70565911f924fp-101, + 0x1.2b3278872084p-155}, + {0x1.45f306dc9c884p5, -0x1.5ac07b1505c14p-47, -0x1.96447e493ad4dp-99, + 0x1.3c439041fe516p-154}, + {-0x1.f246c6efab58p4, -0x1.ec5417056591p-49, -0x1.f924eb53361dep-101, + -0x1.bef806ba71508p-156}, + {0x1.391054a7f09d4p4, 0x1.f47d4d377036cp-48, 0x1.8a5664f10e41p-100, + 0x1.fe5163abdebbcp-154}, + {0x1.529fc2757d1f4p2, 0x1.34ddc0db62958p-50, 0x1.93c439041fe51p-102, + 0x1.8eaf7aef1586ep-156}, + {-0x1.ec5417056591p-1, -0x1.f924eb53361ep-53, 0x1.c820ff28b1d5fp-105, + -0x1.443a9e48db91cp-162}, + {-0x1.505c1596447e4p5, -0x1.275a99b0ef1cp-48, 0x1.07f9458eaf7afp-100, + -0x1.d4f246dc8e2dfp-157}, + {-0x1.596447e493ad4p1, -0x1.9b0ef1bef806cp-52, 0x1.63abdebbc561bp-106, + 0x1.c91b8e909374cp-160}, + {0x1.bb81b6c52b328p5, -0x1.de37df00d74e4p-49, 0x1.5ef5de2b0db92p-101, + 0x1.b8e909374b802p-156}, + {0x1.b6c52b3278874p5, -0x1.f7c035d38a844p-47, 0x1.778ac36e48dc7p-99, + 0x1.2126e97003249p-153}, + {0x1.2b3278872084p5, -0x1.ae9c5421443a8p-50, -0x1.e48db91c5bdb2p-102, + -0x1.68ffcdb688afbp-157}, + {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db8p-49, 0x1.2371d2126e97p-101, + 0x1.924bba8274648p-160}, + {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dbp-104, + -0x1.115f62e6de302p-158}, + {-0x1.ae9c5421443a8p-2, -0x1.e48db91c5bdb4p-54, 0x1.d2e006492eea1p-106, + -0x1.8b9b78c078854p-160}, + {-0x1.38a84288753c8p5, -0x1.1b7238b7b645cp-47, 0x1.c00c925dd413ap-99, + 0x1.921cfe1deb1cbp-154}, + {-0x1.0a21d4f246dc8p3, -0x1.c5bdb22d1ff9cp-50, 0x1.25dd413a3243ap-103, + -0x1.e214e34ed658cp-162}, + {-0x1.d4f246dc8e2ep3, 0x1.26e9700324978p-49, -0x1.5f62e6de301e2p-102, + -0x1.4e34ed658c117p-158}, + {-0x1.236e4716f6c8cp4, 0x1.700324977505p-49, -0x1.736f180f10a72p-101, + 0x1.62534e7dd1047p-155}, + {0x1.b8e909374b8p4, 0x1.924bba8274648p-48, 0x1.cfe1deb1cb12ap-102, + -0x1.63045df7282b4p-156}, + {0x1.09374b801924cp4, -0x1.15f62e6de302p-50, 0x1.deb1cb129a73fp-102, + -0x1.77dca0ad144bbp-158}, + {-0x1.68ffcdb688afcp3, 0x1.d1921cfe1debp-50, 0x1.cb129a73ee882p-102, + 0x1.afa975da24275p-157}, + {0x1.924bba8274648p0, 0x1.cfe1deb1cb128p-54, 0x1.a73ee88235f53p-106, + -0x1.44bb7b16638fep-162}, + {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, + 0x1.213a671c09ad1p-160}, + {0x1.3a32439fc3bd8p1, -0x1.c69dacb1822fp-51, 0x1.1afa975da2427p-105, + 0x1.338e04d68befdp-159}, + {-0x1.b78c0788538d4p4, 0x1.29a73ee88236p-50, -0x1.5a28976f62cc7p-103, + -0x1.fb29741037d8dp-159}, + {0x1.fc3bd63962534p5, 0x1.cfba208d7d4bcp-48, -0x1.12edec598e3f6p-100, + -0x1.4ba081bec66e3p-154}, + {-0x1.4e34ed658c118p2, 0x1.046bea5d7689p-51, 0x1.3a671c09ad17ep-104, + -0x1.bec66e29c67cbp-162}, + {0x1.62534e7dd1048p5, -0x1.415a28976f62cp-47, -0x1.8e3f652e8207p-100, + 0x1.3991d63983534p-154}, + {-0x1.63045df7282b4p4, -0x1.44bb7b16638fcp-50, -0x1.94ba081bec66ep-102, + -0x1.4e33e566305b2p-157}, + {0x1.d1046bea5d768p5, 0x1.213a671c09adp-48, 0x1.7df904e64758ep-100, + 0x1.835339f49c846p-154}, + {0x1.afa975da24274p3, 0x1.9c7026b45f7e4p-50, 0x1.3991d63983534p-106, + -0x1.82d8dee81d108p-160}, + {-0x1.a28976f62cc7p5, -0x1.fb29741037d8cp-47, -0x1.b8a719f2b3183p-100, + 0x1.3908bf177bf25p-155}, + {-0x1.76f62cc71fb28p5, -0x1.741037d8cdc54p-47, 0x1.cc1a99cfa4e42p-101, + 0x1.7e2ef7e4a0ec8p-156}, + 
{0x1.d338e04d68bfp5, -0x1.bec66e29c67ccp-50, 0x1.339f49c845f8cp-102, + -0x1.081b5f13801dap-156}, + {0x1.c09ad17df905p4, -0x1.9b8a719f2b318p-48, -0x1.6c6f740e8840ep-103, + 0x1.41d8ffc4bffefp-157}, + {0x1.68befc827323cp5, -0x1.38cf9598c16c8p-47, 0x1.08bf177bf2507p-99, + 0x1.8ffc4bffef02dp-153}, + {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ff1p-103, + 0x1.7ffde05980fefp-158}, + {-0x1.8cdc538cf9598p5, -0x1.82d8dee81d108p-48, -0x1.b5f13801da001p-104, + 0x1.e05980fef2f12p-158}, + {-0x1.4e33e566305bp3, -0x1.bdd03a21036cp-49, 0x1.d8ffc4bffef03p-101, + -0x1.9fc04343b9d29p-156}, + {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f138p-52, -0x1.da00087e99fcp-104, + -0x1.0d0ee74a5f593p-158}, + {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086877p-109, + -0x1.d297d64b824b2p-164}, + {0x1.3908bf177bf24p5, 0x1.0763ff12fffbcp-47, 0x1.6603fbcbc462dp-104, + 0x1.a0a6d1f6d367fp-158}, + {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f593p-110, + 0x1.f6d367ecf27cbp-166}, + {-0x1.081b5f13801dcp4, 0x1.fff7816603fbcp-48, 0x1.788c5ad05369p-101, + -0x1.25930261b069fp-155}, + {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee8p-50, 0x1.6b414da3eda6dp-103, + -0x1.30d834f648b0cp-162}, + {-0x1.c00ed00043f4cp5, -0x1.fc04343b9d298p-48, 0x1.4da3eda6cfd9ep-103, + 0x1.3e584dba7a32p-157}, + {0x1.2fffbc0b301fcp5, 0x1.e5e2316b414dcp-47, -0x1.c125930261b07p-99, + 0x1.84dba7a31fb35p-153}, + {-0x1.0fd33f8086878p3, 0x1.8b5a0a6d1f6d4p-50, -0x1.30261b069ec91p-103, + -0x1.85ce04cb0d00bp-157}, + {-0x1.9fc04343b9d28p4, -0x1.7d64b824b2604p-48, -0x1.86c1a7b24585dp-101, + 0x1.fb34f2ff516bbp-157}, + {-0x1.0d0ee74a5f594p2, 0x1.1f6d367ecf27cp-50, 0x1.6136e9e8c7ecdp-103, + 0x1.e5fea2d7527bbp-158}, + {-0x1.dce94beb25c14p5, 0x1.a6cfd9e4f9614p-47, -0x1.22c2e70265868p-100, + -0x1.5d28ad8453814p-158}, + {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b6p-104, + -0x1.5b08a7028341dp-159}, + {0x1.b47db4d9fb3c8p4, 0x1.f2c26dd3d18fcp-48, 0x1.9a797fa8b5d4ap-100, + -0x1.14e050683a131p-156}, + {-0x1.25930261b06ap5, 0x1.36e9e8c7ecd3cp-47, 0x1.7fa8b5d49eeb2p-100, + -0x1.41a0e84c2f8c6p-158}, + {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bcp-51, 0x1.aea4f758fd7ccp-103, + -0x1.d0985f18c10ebp-159}, + {-0x1.b069ec9161738p5, -0x1.32c3402ba515cp-51, 0x1.eeb1faf97c5edp-104, + -0x1.7c63043ad6b69p-161}, + {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41dp-104, + -0x1.821d6b5b4565p-160}, + {-0x1.61738132c3404p5, 0x1.45aea4f758fd8p-47, -0x1.a0e84c2f8c608p-102, + -0x1.d6b5b45650128p-156}, + {0x1.fb34f2ff516bcp3, -0x1.6c229c0a0d074p-49, -0x1.30be31821d6b6p-104, + 0x1.2ea6bfb5fb12p-158}, + {0x1.3cbfd45aea4f8p5, -0x1.4e050683a130cp-48, 0x1.ce7de294a4baap-104, + -0x1.404a04ee072a3p-158}, + {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, + -0x1.3b81ca8bdea7fp-164}, + {-0x1.15b08a702834p5, -0x1.d0985f18c10ecp-47, 0x1.4a4ba9afed7ecp-100, + 0x1.1f8d5d0856033p-154}, +}; + +// Lookup table for sin(k * pi / 128) with k = 0, ..., 255. 
+// Table is generated with Sollya as follow: +// > display = hexadecimal; +// > for k from 0 to 255 do { +// a = round(sin(k * pi/128), 51, RN); +// b = round(sin(k * pi/128) - a, D, RN); +// print("{", b, ",", a, "},"); +// }; +LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { + {0, 0}, + {0x1.f938a73db97fbp-58, 0x1.92155f7a3667cp-6}, + {-0x1.912bd0d569a9p-61, 0x1.91f65f10dd814p-5}, + {0x1.ccbeeeae8129ap-56, 0x1.2d52092ce19f4p-4}, + {-0x1.e2718d26ed688p-60, 0x1.917a6bc29b42cp-4}, + {-0x1.cbb1f71aca352p-56, 0x1.f564e56a9731p-4}, + {-0x1.dd9ffeaecbdc4p-55, 0x1.2c8106e8e613cp-3}, + {-0x1.ab3802218894fp-55, 0x1.5e214448b3fc8p-3}, + {-0x1.49b466e7fe36p-55, 0x1.8f8b83c69a60cp-3}, + {-0x1.035e2873ca432p-55, 0x1.c0b826a7e4f64p-3}, + {-0x1.50b7bbc4768b1p-55, 0x1.f19f97b215f1cp-3}, + {-0x1.3ed9efaa42ab3p-55, 0x1.111d262b1f678p-2}, + {0x1.a8b5c974ee7b5p-54, 0x1.294062ed59f04p-2}, + {0x1.4325f12be8946p-54, 0x1.4135c941766p-2}, + {0x1.fc2047e54e614p-55, 0x1.58f9a75ab1fdcp-2}, + {-0x1.512c678219317p-54, 0x1.7088530fa45ap-2}, + {-0x1.2e59dba7ab4c2p-54, 0x1.87de2a6aea964p-2}, + {-0x1.d24afdade848bp-54, 0x1.9ef7943a8ed8cp-2}, + {0x1.5b362cb974183p-57, 0x1.b5d1009e15ccp-2}, + {-0x1.e97af1a63c807p-54, 0x1.cc66e9931c46p-2}, + {-0x1.c3e4edc5872f8p-55, 0x1.e2b5d3806f63cp-2}, + {0x1.fb44f80f92225p-54, 0x1.f8ba4dbf89ab8p-2}, + {0x1.9697faf2e2fe5p-53, 0x1.073879922ffecp-1}, + {-0x1.7bc8eda6af93cp-53, 0x1.11eb3541b4b24p-1}, + {0x1.b25dd267f66p-55, 0x1.1c73b39ae68c8p-1}, + {-0x1.5769d0fbcddc3p-53, 0x1.26d054cdd12ep-1}, + {0x1.c20673b2116b2p-54, 0x1.30ff7fce17034p-1}, + {0x1.3c7c4bc72a92cp-53, 0x1.3affa292050b8p-1}, + {-0x1.e7f895d302395p-53, 0x1.44cf325091dd8p-1}, + {0x1.13c293edceb32p-53, 0x1.4e6cabbe3e5e8p-1}, + {-0x1.75720992bfbb2p-55, 0x1.57d69348cecap-1}, + {-0x1.24a366a5fe547p-53, 0x1.610b7551d2cep-1}, + {0x1.21165f626cdd5p-54, 0x1.6a09e667f3bccp-1}, + {-0x1.bcac43c389ba9p-53, 0x1.72d0837efff98p-1}, + {-0x1.21ea6f59be15bp-53, 0x1.7b5df226aafbp-1}, + {0x1.d217be0e2b971p-53, 0x1.83b0e0bff976cp-1}, + {0x1.69d0f6897664ap-54, 0x1.8bc806b15174p-1}, + {-0x1.615f32b6f907ap-54, 0x1.93a22499263fcp-1}, + {0x1.6788ebcc76dc6p-54, 0x1.9b3e047f3874p-1}, + {0x1.ddae89fd441d1p-53, 0x1.a29a7a046278p-1}, + {-0x1.f98273c5d2495p-54, 0x1.a9b66290ea1a4p-1}, + {-0x1.926da300ffccep-55, 0x1.b090a581502p-1}, + {0x1.90e58336c64a8p-53, 0x1.b728345196e3cp-1}, + {0x1.9f6963354e3fep-53, 0x1.bd7c0ac6f9528p-1}, + {0x1.a47d3a2a0dcbep-54, 0x1.c38b2f180bdbp-1}, + {0x1.ed0489e16b9ap-54, 0x1.c954b213411f4p-1}, + {-0x1.0f3db5dad5ac5p-53, 0x1.ced7af43cc774p-1}, + {0x1.ac42b5a8b6943p-53, 0x1.d4134d14dc938p-1}, + {-0x1.d75033dfb9ca8p-53, 0x1.d906bcf328d48p-1}, + {0x1.83c37c6107db3p-55, 0x1.ddb13b6ccc23cp-1}, + {0x1.7f59c49f6cd6dp-54, 0x1.e212104f686e4p-1}, + {0x1.ee94a90d7b88bp-53, 0x1.e6288ec48e11p-1}, + {-0x1.a27d3874701f9p-53, 0x1.e9f4156c62ddcp-1}, + {-0x1.85f4e1b8298dp-54, 0x1.ed740e7684964p-1}, + {-0x1.ab4e148e52d9ep-54, 0x1.f0a7efb9230d8p-1}, + {0x1.8a11412b82346p-54, 0x1.f38f3ac64e588p-1}, + {0x1.562172a361fd3p-56, 0x1.f6297cff75cbp-1}, + {0x1.3564acef1ff97p-53, 0x1.f8764fa714ba8p-1}, + {-0x1.5e82a3284d5c8p-53, 0x1.fa7557f08a518p-1}, + {-0x1.709bccb89a989p-54, 0x1.fc26470e19fd4p-1}, + {0x1.9e082721dfb8ep-53, 0x1.fd88da3d12524p-1}, + {-0x1.eade132f3981dp-53, 0x1.fe9cdad01883cp-1}, + {0x1.e3a843d1db55fp-53, 0x1.ff621e3796d7cp-1}, + {0x1.765595d548d9ap-54, 0x1.ffd886084cd0cp-1}, + {0, 1}, + {0x1.765595d548d9ap-54, 0x1.ffd886084cd0cp-1}, + {0x1.e3a843d1db55fp-53, 0x1.ff621e3796d7cp-1}, + {-0x1.eade132f3981dp-53, 0x1.fe9cdad01883cp-1}, 
+ {0x1.9e082721dfb8ep-53, 0x1.fd88da3d12524p-1}, + {-0x1.709bccb89a989p-54, 0x1.fc26470e19fd4p-1}, + {-0x1.5e82a3284d5c8p-53, 0x1.fa7557f08a518p-1}, + {0x1.3564acef1ff97p-53, 0x1.f8764fa714ba8p-1}, + {0x1.562172a361fd3p-56, 0x1.f6297cff75cbp-1}, + {0x1.8a11412b82346p-54, 0x1.f38f3ac64e588p-1}, + {-0x1.ab4e148e52d9ep-54, 0x1.f0a7efb9230d8p-1}, + {-0x1.85f4e1b8298dp-54, 0x1.ed740e7684964p-1}, + {-0x1.a27d3874701f9p-53, 0x1.e9f4156c62ddcp-1}, + {0x1.ee94a90d7b88bp-53, 0x1.e6288ec48e11p-1}, + {0x1.7f59c49f6cd6dp-54, 0x1.e212104f686e4p-1}, + {0x1.83c37c6107db3p-55, 0x1.ddb13b6ccc23cp-1}, + {-0x1.d75033dfb9ca8p-53, 0x1.d906bcf328d48p-1}, + {0x1.ac42b5a8b6943p-53, 0x1.d4134d14dc938p-1}, + {-0x1.0f3db5dad5ac5p-53, 0x1.ced7af43cc774p-1}, + {0x1.ed0489e16b9ap-54, 0x1.c954b213411f4p-1}, + {0x1.a47d3a2a0dcbep-54, 0x1.c38b2f180bdbp-1}, + {0x1.9f6963354e3fep-53, 0x1.bd7c0ac6f9528p-1}, + {0x1.90e58336c64a8p-53, 0x1.b728345196e3cp-1}, + {-0x1.926da300ffccep-55, 0x1.b090a581502p-1}, + {-0x1.f98273c5d2495p-54, 0x1.a9b66290ea1a4p-1}, + {0x1.ddae89fd441d1p-53, 0x1.a29a7a046278p-1}, + {0x1.6788ebcc76dc6p-54, 0x1.9b3e047f3874p-1}, + {-0x1.615f32b6f907ap-54, 0x1.93a22499263fcp-1}, + {0x1.69d0f6897664ap-54, 0x1.8bc806b15174p-1}, + {0x1.d217be0e2b971p-53, 0x1.83b0e0bff976cp-1}, + {-0x1.21ea6f59be15bp-53, 0x1.7b5df226aafbp-1}, + {-0x1.bcac43c389ba9p-53, 0x1.72d0837efff98p-1}, + {0x1.21165f626cdd5p-54, 0x1.6a09e667f3bccp-1}, + {-0x1.24a366a5fe547p-53, 0x1.610b7551d2cep-1}, + {-0x1.75720992bfbb2p-55, 0x1.57d69348cecap-1}, + {0x1.13c293edceb32p-53, 0x1.4e6cabbe3e5e8p-1}, + {-0x1.e7f895d302395p-53, 0x1.44cf325091dd8p-1}, + {0x1.3c7c4bc72a92cp-53, 0x1.3affa292050b8p-1}, + {0x1.c20673b2116b2p-54, 0x1.30ff7fce17034p-1}, + {-0x1.5769d0fbcddc3p-53, 0x1.26d054cdd12ep-1}, + {0x1.b25dd267f66p-55, 0x1.1c73b39ae68c8p-1}, + {-0x1.7bc8eda6af93cp-53, 0x1.11eb3541b4b24p-1}, + {0x1.9697faf2e2fe5p-53, 0x1.073879922ffecp-1}, + {0x1.fb44f80f92225p-54, 0x1.f8ba4dbf89ab8p-2}, + {-0x1.c3e4edc5872f8p-55, 0x1.e2b5d3806f63cp-2}, + {-0x1.e97af1a63c807p-54, 0x1.cc66e9931c46p-2}, + {0x1.5b362cb974183p-57, 0x1.b5d1009e15ccp-2}, + {-0x1.d24afdade848bp-54, 0x1.9ef7943a8ed8cp-2}, + {-0x1.2e59dba7ab4c2p-54, 0x1.87de2a6aea964p-2}, + {-0x1.512c678219317p-54, 0x1.7088530fa45ap-2}, + {0x1.fc2047e54e614p-55, 0x1.58f9a75ab1fdcp-2}, + {0x1.4325f12be8946p-54, 0x1.4135c941766p-2}, + {0x1.a8b5c974ee7b5p-54, 0x1.294062ed59f04p-2}, + {-0x1.3ed9efaa42ab3p-55, 0x1.111d262b1f678p-2}, + {-0x1.50b7bbc4768b1p-55, 0x1.f19f97b215f1cp-3}, + {-0x1.035e2873ca432p-55, 0x1.c0b826a7e4f64p-3}, + {-0x1.49b466e7fe36p-55, 0x1.8f8b83c69a60cp-3}, + {-0x1.ab3802218894fp-55, 0x1.5e214448b3fc8p-3}, + {-0x1.dd9ffeaecbdc4p-55, 0x1.2c8106e8e613cp-3}, + {-0x1.cbb1f71aca352p-56, 0x1.f564e56a9731p-4}, + {-0x1.e2718d26ed688p-60, 0x1.917a6bc29b42cp-4}, + {0x1.ccbeeeae8129ap-56, 0x1.2d52092ce19f4p-4}, + {-0x1.912bd0d569a9p-61, 0x1.91f65f10dd814p-5}, + {0x1.f938a73db97fbp-58, 0x1.92155f7a3667cp-6}, + {0, 0}, + {-0x1.f938a73db97fbp-58, -0x1.92155f7a3667cp-6}, + {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, + {-0x1.ccbeeeae8129ap-56, -0x1.2d52092ce19f4p-4}, + {0x1.e2718d26ed688p-60, -0x1.917a6bc29b42cp-4}, + {0x1.cbb1f71aca352p-56, -0x1.f564e56a9731p-4}, + {0x1.dd9ffeaecbdc4p-55, -0x1.2c8106e8e613cp-3}, + {0x1.ab3802218894fp-55, -0x1.5e214448b3fc8p-3}, + {0x1.49b466e7fe36p-55, -0x1.8f8b83c69a60cp-3}, + {0x1.035e2873ca432p-55, -0x1.c0b826a7e4f64p-3}, + {0x1.50b7bbc4768b1p-55, -0x1.f19f97b215f1cp-3}, + {0x1.3ed9efaa42ab3p-55, -0x1.111d262b1f678p-2}, + {-0x1.a8b5c974ee7b5p-54, 
-0x1.294062ed59f04p-2}, + {-0x1.4325f12be8946p-54, -0x1.4135c941766p-2}, + {-0x1.fc2047e54e614p-55, -0x1.58f9a75ab1fdcp-2}, + {0x1.512c678219317p-54, -0x1.7088530fa45ap-2}, + {0x1.2e59dba7ab4c2p-54, -0x1.87de2a6aea964p-2}, + {0x1.d24afdade848bp-54, -0x1.9ef7943a8ed8cp-2}, + {-0x1.5b362cb974183p-57, -0x1.b5d1009e15ccp-2}, + {0x1.e97af1a63c807p-54, -0x1.cc66e9931c46p-2}, + {0x1.c3e4edc5872f8p-55, -0x1.e2b5d3806f63cp-2}, + {-0x1.fb44f80f92225p-54, -0x1.f8ba4dbf89ab8p-2}, + {-0x1.9697faf2e2fe5p-53, -0x1.073879922ffecp-1}, + {0x1.7bc8eda6af93cp-53, -0x1.11eb3541b4b24p-1}, + {-0x1.b25dd267f66p-55, -0x1.1c73b39ae68c8p-1}, + {0x1.5769d0fbcddc3p-53, -0x1.26d054cdd12ep-1}, + {-0x1.c20673b2116b2p-54, -0x1.30ff7fce17034p-1}, + {-0x1.3c7c4bc72a92cp-53, -0x1.3affa292050b8p-1}, + {0x1.e7f895d302395p-53, -0x1.44cf325091dd8p-1}, + {-0x1.13c293edceb32p-53, -0x1.4e6cabbe3e5e8p-1}, + {0x1.75720992bfbb2p-55, -0x1.57d69348cecap-1}, + {0x1.24a366a5fe547p-53, -0x1.610b7551d2cep-1}, + {-0x1.21165f626cdd5p-54, -0x1.6a09e667f3bccp-1}, + {0x1.bcac43c389ba9p-53, -0x1.72d0837efff98p-1}, + {0x1.21ea6f59be15bp-53, -0x1.7b5df226aafbp-1}, + {-0x1.d217be0e2b971p-53, -0x1.83b0e0bff976cp-1}, + {-0x1.69d0f6897664ap-54, -0x1.8bc806b15174p-1}, + {0x1.615f32b6f907ap-54, -0x1.93a22499263fcp-1}, + {-0x1.6788ebcc76dc6p-54, -0x1.9b3e047f3874p-1}, + {-0x1.ddae89fd441d1p-53, -0x1.a29a7a046278p-1}, + {0x1.f98273c5d2495p-54, -0x1.a9b66290ea1a4p-1}, + {0x1.926da300ffccep-55, -0x1.b090a581502p-1}, + {-0x1.90e58336c64a8p-53, -0x1.b728345196e3cp-1}, + {-0x1.9f6963354e3fep-53, -0x1.bd7c0ac6f9528p-1}, + {-0x1.a47d3a2a0dcbep-54, -0x1.c38b2f180bdbp-1}, + {-0x1.ed0489e16b9ap-54, -0x1.c954b213411f4p-1}, + {0x1.0f3db5dad5ac5p-53, -0x1.ced7af43cc774p-1}, + {-0x1.ac42b5a8b6943p-53, -0x1.d4134d14dc938p-1}, + {0x1.d75033dfb9ca8p-53, -0x1.d906bcf328d48p-1}, + {-0x1.83c37c6107db3p-55, -0x1.ddb13b6ccc23cp-1}, + {-0x1.7f59c49f6cd6dp-54, -0x1.e212104f686e4p-1}, + {-0x1.ee94a90d7b88bp-53, -0x1.e6288ec48e11p-1}, + {0x1.a27d3874701f9p-53, -0x1.e9f4156c62ddcp-1}, + {0x1.85f4e1b8298dp-54, -0x1.ed740e7684964p-1}, + {0x1.ab4e148e52d9ep-54, -0x1.f0a7efb9230d8p-1}, + {-0x1.8a11412b82346p-54, -0x1.f38f3ac64e588p-1}, + {-0x1.562172a361fd3p-56, -0x1.f6297cff75cbp-1}, + {-0x1.3564acef1ff97p-53, -0x1.f8764fa714ba8p-1}, + {0x1.5e82a3284d5c8p-53, -0x1.fa7557f08a518p-1}, + {0x1.709bccb89a989p-54, -0x1.fc26470e19fd4p-1}, + {-0x1.9e082721dfb8ep-53, -0x1.fd88da3d12524p-1}, + {0x1.eade132f3981dp-53, -0x1.fe9cdad01883cp-1}, + {-0x1.e3a843d1db55fp-53, -0x1.ff621e3796d7cp-1}, + {-0x1.765595d548d9ap-54, -0x1.ffd886084cd0cp-1}, + {0, -1}, + {-0x1.765595d548d9ap-54, -0x1.ffd886084cd0cp-1}, + {-0x1.e3a843d1db55fp-53, -0x1.ff621e3796d7cp-1}, + {0x1.eade132f3981dp-53, -0x1.fe9cdad01883cp-1}, + {-0x1.9e082721dfb8ep-53, -0x1.fd88da3d12524p-1}, + {0x1.709bccb89a989p-54, -0x1.fc26470e19fd4p-1}, + {0x1.5e82a3284d5c8p-53, -0x1.fa7557f08a518p-1}, + {-0x1.3564acef1ff97p-53, -0x1.f8764fa714ba8p-1}, + {-0x1.562172a361fd3p-56, -0x1.f6297cff75cbp-1}, + {-0x1.8a11412b82346p-54, -0x1.f38f3ac64e588p-1}, + {0x1.ab4e148e52d9ep-54, -0x1.f0a7efb9230d8p-1}, + {0x1.85f4e1b8298dp-54, -0x1.ed740e7684964p-1}, + {0x1.a27d3874701f9p-53, -0x1.e9f4156c62ddcp-1}, + {-0x1.ee94a90d7b88bp-53, -0x1.e6288ec48e11p-1}, + {-0x1.7f59c49f6cd6dp-54, -0x1.e212104f686e4p-1}, + {-0x1.83c37c6107db3p-55, -0x1.ddb13b6ccc23cp-1}, + {0x1.d75033dfb9ca8p-53, -0x1.d906bcf328d48p-1}, + {-0x1.ac42b5a8b6943p-53, -0x1.d4134d14dc938p-1}, + {0x1.0f3db5dad5ac5p-53, -0x1.ced7af43cc774p-1}, + {-0x1.ed0489e16b9ap-54, -0x1.c954b213411f4p-1}, + 
{-0x1.a47d3a2a0dcbep-54, -0x1.c38b2f180bdbp-1}, + {-0x1.9f6963354e3fep-53, -0x1.bd7c0ac6f9528p-1}, + {-0x1.90e58336c64a8p-53, -0x1.b728345196e3cp-1}, + {0x1.926da300ffccep-55, -0x1.b090a581502p-1}, + {0x1.f98273c5d2495p-54, -0x1.a9b66290ea1a4p-1}, + {-0x1.ddae89fd441d1p-53, -0x1.a29a7a046278p-1}, + {-0x1.6788ebcc76dc6p-54, -0x1.9b3e047f3874p-1}, + {0x1.615f32b6f907ap-54, -0x1.93a22499263fcp-1}, + {-0x1.69d0f6897664ap-54, -0x1.8bc806b15174p-1}, + {-0x1.d217be0e2b971p-53, -0x1.83b0e0bff976cp-1}, + {0x1.21ea6f59be15bp-53, -0x1.7b5df226aafbp-1}, + {0x1.bcac43c389ba9p-53, -0x1.72d0837efff98p-1}, + {-0x1.21165f626cdd5p-54, -0x1.6a09e667f3bccp-1}, + {0x1.24a366a5fe547p-53, -0x1.610b7551d2cep-1}, + {0x1.75720992bfbb2p-55, -0x1.57d69348cecap-1}, + {-0x1.13c293edceb32p-53, -0x1.4e6cabbe3e5e8p-1}, + {0x1.e7f895d302395p-53, -0x1.44cf325091dd8p-1}, + {-0x1.3c7c4bc72a92cp-53, -0x1.3affa292050b8p-1}, + {-0x1.c20673b2116b2p-54, -0x1.30ff7fce17034p-1}, + {0x1.5769d0fbcddc3p-53, -0x1.26d054cdd12ep-1}, + {-0x1.b25dd267f66p-55, -0x1.1c73b39ae68c8p-1}, + {0x1.7bc8eda6af93cp-53, -0x1.11eb3541b4b24p-1}, + {-0x1.9697faf2e2fe5p-53, -0x1.073879922ffecp-1}, + {-0x1.fb44f80f92225p-54, -0x1.f8ba4dbf89ab8p-2}, + {0x1.c3e4edc5872f8p-55, -0x1.e2b5d3806f63cp-2}, + {0x1.e97af1a63c807p-54, -0x1.cc66e9931c46p-2}, + {-0x1.5b362cb974183p-57, -0x1.b5d1009e15ccp-2}, + {0x1.d24afdade848bp-54, -0x1.9ef7943a8ed8cp-2}, + {0x1.2e59dba7ab4c2p-54, -0x1.87de2a6aea964p-2}, + {0x1.512c678219317p-54, -0x1.7088530fa45ap-2}, + {-0x1.fc2047e54e614p-55, -0x1.58f9a75ab1fdcp-2}, + {-0x1.4325f12be8946p-54, -0x1.4135c941766p-2}, + {-0x1.a8b5c974ee7b5p-54, -0x1.294062ed59f04p-2}, + {0x1.3ed9efaa42ab3p-55, -0x1.111d262b1f678p-2}, + {0x1.50b7bbc4768b1p-55, -0x1.f19f97b215f1cp-3}, + {0x1.035e2873ca432p-55, -0x1.c0b826a7e4f64p-3}, + {0x1.49b466e7fe36p-55, -0x1.8f8b83c69a60cp-3}, + {0x1.ab3802218894fp-55, -0x1.5e214448b3fc8p-3}, + {0x1.dd9ffeaecbdc4p-55, -0x1.2c8106e8e613cp-3}, + {0x1.cbb1f71aca352p-56, -0x1.f564e56a9731p-4}, + {0x1.e2718d26ed688p-60, -0x1.917a6bc29b42cp-4}, + {-0x1.ccbeeeae8129ap-56, -0x1.2d52092ce19f4p-4}, + {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, + {-0x1.f938a73db97fbp-58, -0x1.92155f7a3667cp-6}, +}; + +LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { + constexpr double ONE_TWENTY_EIGHT_OVER_PI = 0x1.45f306dc9c883p5; + + // Digits of -pi/128, generated by Sollya with: + // > a = round(-pi/128, 25, RN); + // > b = round(-pi/128 - a, 23, RN); + // > c = round(-pi/128 - a - b, 25, RN); + // > d = round(-pi/128 - a - b - c, D, RN); + // -pi/128 ~ a + b + c + d + // The precisions of the parts are chosen so that: + // 1) k * a, k * b, k * c are exact in double precision + // 2) k * b + (x - (k * a)) is exact in double precsion + constexpr double MPI_OVER_128[4] = {-0x1.921fb5p-6, -0x1.110b48p-32, + +0x1.ee59dap-56, -0x1.98a2e03707345p-83}; + + double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI; + double kd = fputil::nearest_integer(prod_hi); + + // With -pi/128 ~ a + b + c + d as in MPI_OVER_128 description: + // t = x + k * a + double t = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact + // y_hi = t + k * b = (x + k * a) + k * b + double y_hi = fputil::multiply_add(kd, MPI_OVER_128[1], t); // Exact + // y_lo ~ k * c + k * d + double y_lo = fputil::multiply_add(kd, MPI_OVER_128[2], kd * MPI_OVER_128[3]); + // u.hi + u.lo ~ x + k * (a + b + c + d) + u = fputil::exact_add(y_hi, y_lo); + // Error bound: For |x| < 2^-23, + // |(x mod pi/128) - (u_hi + u_lo)| < ulp(y_lo) + // <= ulp(2 * x * c) + // <= ulp(2^24 * 
2^-56) + // = 2^(24 - 56 - 52) + // = 2^-84 + return static_cast(static_cast(kd)); +} + +} // namespace nofma + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_NOFMA_H diff --git a/libc/src/math/generic/sin.cpp b/libc/src/math/generic/sin.cpp new file mode 100644 index 00000000000000..207435d4385ae1 --- /dev/null +++ b/libc/src/math/generic/sin.cpp @@ -0,0 +1,207 @@ +//===-- Double-precision sin function -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sin.h" +#include "hdr/errno_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/sincos_eval.h" + +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" + +using LIBC_NAMESPACE::fma::FAST_PASS_EXPONENT; +using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI; +using LIBC_NAMESPACE::fma::range_reduction_small; +using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128; + +LIBC_INLINE constexpr bool NO_FMA = false; +#else +#include "range_reduction_double_nofma.h" + +using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT; +using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI; +using LIBC_NAMESPACE::nofma::range_reduction_small; +using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128; + +LIBC_INLINE constexpr bool NO_FMA = true; +#endif // LIBC_TARGET_CPU_HAS_FMA + +// TODO: We might be able to improve the performance of large range reduction of +// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and +// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of +// those lookup table. +#include "range_reduction_double_common.h" + +#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) +#define LIBC_MATH_SIN_SKIP_ACCURATE_PASS +#endif + +namespace LIBC_NAMESPACE { + +using DoubleDouble = fputil::DoubleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +LLVM_LIBC_FUNCTION(double, sin, (double x)) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint16_t x_e = xbits.get_biased_exponent(); + + DoubleDouble y; + unsigned k; + generic::LargeRangeReduction range_reduction_large; + + // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { + // |x| < 2^-26 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) + return x; + + // For |x| < 2^-26, |sin(x) - x| < ulp(x)/2. 
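The fast path above leans on a Cody-Waite style split of pi/128: k * a and k * b stay exact in double precision, so the reduced argument keeps its low bits even after heavy cancellation. The following is a minimal standalone sketch of that idea (assuming only <cmath> and <cstdio>); it reuses the 128/pi constant and the first three pieces of MPI_OVER_128 shown above, returns a plain double instead of a DoubleDouble, and the naive single-subtraction version is included only for contrast.

#include <cmath>
#include <cstdio>

int main() {
  double x = 1.0e6; // inside the non-FMA fast pass (|x| < 2^23)
  // k = round(x * 128/pi), then y ~ x - k * pi/128 accumulated piece by piece.
  double kd = std::nearbyint(x * 0x1.45f306dc9c883p5); // 128/pi
  double t = std::fma(kd, -0x1.921fb5p-6, x);          // x - k*a
  double u = std::fma(kd, -0x1.110b48p-32, t);         // ... - k*b
  double y = std::fma(kd, 0x1.ee59dap-56, u);          // ... + k*c

  // Naive reduction for comparison: the rounding error of kd * (pi/128) is on
  // the order of ulp(x), which wipes out most of the low bits of y.
  const double PI_OVER_128 = 0x1.921fb54442d18p-6;
  double y_naive = x - kd * PI_OVER_128;

  std::printf("k = %.0f\ny (3 pieces) = %a\ny (naive)    = %a\n", kd, y, y_naive);
  return 0;
}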
+#ifdef LIBC_TARGET_CPU_HAS_FMA + return fputil::multiply_add(x, -0x1.0p-54, x); +#else + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if (rounding_mode == FE_TOWARDZERO || + (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) + return FPBits(xbits.uintval() - 1).get_val(); + } + return fputil::multiply_add(x, -0x1.0p-54, x); +#endif // LIBC_TARGET_CPU_HAS_FMA + } + + // // Small range reduction. + k = range_reduction_small(x, y); + } else { + // Inf or NaN + if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { + // sin(+-Inf) = NaN + if (xbits.get_mantissa() == 0) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + } + return x + FPBits::quiet_nan().get_val(); + } + + // Large range reduction. + k = range_reduction_large.compute_high_part(x); + y = range_reduction_large.fast(); + } + + DoubleDouble sin_y, cos_y; + + generic::sincos_eval(y, sin_y, cos_y); + + // Look up sin(k * pi/128) and cos(k * pi/128) + // Memory saving versions: + + // Use 128-entry table instead: + // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; + // uint64_t sin_s = static_cast(k & 128) << (63 - 7); + // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); + // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); + // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; + // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); + // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); + // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); + + // Use 64-entry table instead: + // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + // if (kk & 128) { + // ans.hi = -ans.hi; + // ans.lo = -ans.lo; + // } + // return ans; + // }; + // DoubleDouble sin_k = get_idx_dd(k); + // DoubleDouble cos_k = get_idx_dd(k + 64); + + // Fast look up version, but needs 256-entry table. + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 255]; + DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; + + // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). + // So k is an integer and -pi / 256 <= y <= pi / 256. + // Then sin(x) = sin((k * pi/128 + y) + // = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128) + DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); + DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); + + DoubleDouble rr = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi); + rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo; + +#ifdef LIBC_MATH_SIN_SKIP_ACCURATE_PASS + return rr.hi + rr.lo; +#else + // Accurate test and pass for correctly rounded implementation. + +#ifdef LIBC_TARGET_CPU_HAS_FMA + constexpr double ERR = 0x1.0p-70; +#else + // TODO: Improve non-FMA fast pass accuracy. + constexpr double ERR = 0x1.0p-66; +#endif // LIBC_TARGET_CPU_HAS_FMA + + double rlp = rr.lo + ERR; + double rlm = rr.lo - ERR; + + double r_upper = rr.hi + rlp; // (rr.lo + ERR); + double r_lower = rr.hi + rlm; // (rr.lo - ERR); + + // Ziv's rounding test. 
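Ziv's rounding test referenced above works on the pair (hi, lo) plus a proven error bound: if adding lo + ERR and lo - ERR to hi rounds to the same double, then the true value, which lies between those two sums, rounds to that same double, so the fast result can be returned; otherwise the slower Float128 pass runs. A small sketch of just that decision (ziv_round is an illustrative name, and the bound passed in main is made up, not the 2^-70 / 2^-66 bounds used above):

#include <cstdio>

// r ~ hi + lo with |true_value - (hi + lo)| < err.  If both perturbed sums
// round identically, that double is the correctly rounded result.
bool ziv_round(double hi, double lo, double err, double &result) {
  double upper = hi + (lo + err);
  double lower = hi + (lo - err);
  if (upper == lower) {
    result = upper; // fast path accepted
    return true;
  }
  return false; // caller falls back to a higher-precision evaluation
}

int main() {
  double r = 0.0;
  bool ok = ziv_round(0x1.8p-1, 0x1.0p-60, 0x1.0p-70, r);
  std::printf("fast path %s, r = %a\n", ok ? "accepted" : "rejected", r);
  return 0;
}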
+ if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + Float128 u_f128, sin_u, cos_u; + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) + u_f128 = generic::range_reduction_small_f128(x); + else + u_f128 = range_reduction_large.accurate(); + + generic::sincos_eval(u_f128, sin_u, cos_u); + + auto get_sin_k = [](unsigned kk) -> Float128 { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + if (kk & 128) + ans.sign = Sign::NEG; + return ans; + }; + + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + Float128 sin_k_f128 = get_sin_k(k); + Float128 cos_k_f128 = get_sin_k(k + 64); + + // sin(x) = sin((k * pi/128 + u) + // = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128) + Float128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u), + fputil::quick_mul(cos_k_f128, sin_u)); + + // TODO: Add assertion if Ziv's accuracy tests fail in debug mode. + // https://github.com/llvm/llvm-project/issues/96452. + + return static_cast(r); +#endif // !LIBC_MATH_SIN_SKIP_ACCURATE_PASS +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/sincos.cpp b/libc/src/math/generic/sincos.cpp new file mode 100644 index 00000000000000..a0dd3a018af59c --- /dev/null +++ b/libc/src/math/generic/sincos.cpp @@ -0,0 +1,247 @@ +//===-- Double-precision sincos function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sincos.h" +#include "hdr/errno_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/sincos_eval.h" + +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" + +using LIBC_NAMESPACE::fma::FAST_PASS_EXPONENT; +using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI; +using LIBC_NAMESPACE::fma::range_reduction_small; +using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128; + +LIBC_INLINE constexpr bool NO_FMA = false; +#else +#include "range_reduction_double_nofma.h" + +using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT; +using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI; +using LIBC_NAMESPACE::nofma::range_reduction_small; +using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128; + +LIBC_INLINE constexpr bool NO_FMA = true; +#endif // LIBC_TARGET_CPU_HAS_FMA + +// TODO: We might be able to improve the performance of large range reduction of +// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and +// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of +// those lookup table. 
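The get_sin_k lambda in the accurate pass above folds any k modulo 256 onto indices 0 to 64 of SIN_K_PI_OVER_128_F128 by using sin(theta + pi) = -sin(theta) for the top bit and sin(pi - theta) = sin(theta) for the reflection around k = 64. A standalone illustration of that index folding (fold_index is an illustrative helper, not part of the sources; it checks the fold against std::sin):

#include <cmath>
#include <cstdio>

// Map k (mod 256) to an index in [0, 64] plus a sign, so a quarter-wave table
// of sin(idx * pi/128) covers the whole period.
unsigned fold_index(unsigned k, int &sign) {
  k &= 255;
  sign = (k & 128) ? -1 : 1;     // sin(theta + pi) = -sin(theta)
  unsigned j = k & 127;
  return (j > 64) ? 128 - j : j; // sin(pi - theta) = sin(theta)
}

int main() {
  const double PI = 3.141592653589793;
  const unsigned ks[] = {5, 100, 200};
  for (unsigned k : ks) {
    int sign = 1;
    unsigned idx = fold_index(k, sign);
    std::printf("k=%3u -> idx=%2u sign=%+d : %+.17f vs %+.17f\n", k, idx, sign,
                std::sin(k * PI / 128), sign * std::sin(idx * PI / 128));
  }
  return 0;
}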
+#include "range_reduction_double_common.h" + +#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) +#define LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS +#endif + +namespace LIBC_NAMESPACE { + +using DoubleDouble = fputil::DoubleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint16_t x_e = xbits.get_biased_exponent(); + + DoubleDouble y; + unsigned k; + generic::LargeRangeReduction range_reduction_large; + + // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { + // |x| < 2^-27 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) { + *sin_x = x; + *cos_x = 1.0; + return; + } + + // For |x| < 2^-27, max(|sin(x) - x|, |cos(x) - 1|) < ulp(x)/2. +#ifdef LIBC_TARGET_CPU_HAS_FMA + *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); + *cos_x = fputil::multiply_add(x, -x, 1.0); +#else + *cos_x = fputil::round_result_slightly_down(1.0); + + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if (rounding_mode == FE_TOWARDZERO || + (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) + *sin_x = FPBits(xbits.uintval() - 1).get_val(); + } + *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); +#endif // LIBC_TARGET_CPU_HAS_FMA + return; + } + + // // Small range reduction. + k = range_reduction_small(x, y); + } else { + // Inf or NaN + if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { + // sin(+-Inf) = NaN + if (xbits.get_mantissa() == 0) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + } + *sin_x = *cos_x = x + FPBits::quiet_nan().get_val(); + return; + } + + // Large range reduction. + k = range_reduction_large.compute_high_part(x); + y = range_reduction_large.fast(); + } + + DoubleDouble sin_y, cos_y; + + generic::sincos_eval(y, sin_y, cos_y); + + // Look up sin(k * pi/128) and cos(k * pi/128) + // Memory saving versions: + + // Use 128-entry table instead: + // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; + // uint64_t sin_s = static_cast(k & 128) << (63 - 7); + // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); + // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); + // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; + // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); + // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); + // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); + + // Use 64-entry table instead: + // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + // if (kk & 128) { + // ans.hi = -ans.hi; + // ans.lo = -ans.lo; + // } + // return ans; + // }; + // DoubleDouble sin_k = get_idx_dd(k); + // DoubleDouble cos_k = get_idx_dd(k + 64); + + // Fast look up version, but needs 256-entry table. + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 255]; + DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; + DoubleDouble msin_k{-sin_k.lo, -sin_k.hi}; + + // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). + // So k is an integer and -pi / 256 <= y <= pi / 256. 
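On that interval a short Taylor expansion is already extremely accurate: the truncation error of the degree-7 sine and degree-8 cosine polynomials is below roughly 2^-72 and 2^-81 for |y| <= pi/256, which is why sincos_eval (defined later in this patch) only needs to add double-double bookkeeping on top of them to reach its stated evaluation errors of about 2^-69 (sin) and 2^-75 (cos). A plain-double Horner sketch of those same polynomials (sincos_small is an illustrative name; the in-tree code uses fputil::multiply_add and carries a low limb):

#include <cmath>
#include <cstdio>

// Degree-7 sin and degree-8 cos Taylor polynomials, intended for |y| <= pi/256.
void sincos_small(double y, double &sin_y, double &cos_y) {
  double y2 = y * y;
  sin_y = y * (1.0 + y2 * (-1.0 / 6 + y2 * (1.0 / 120 - y2 * (1.0 / 5040))));
  cos_y = 1.0 + y2 * (-0.5 + y2 * (1.0 / 24 +
                                   y2 * (-1.0 / 720 + y2 * (1.0 / 40320))));
}

int main() {
  double s = 0.0, c = 0.0, y = 0x1.8p-7; // ~0.0117, inside [-pi/256, pi/256]
  sincos_small(y, s, c);
  std::printf("sin: %a vs %a\ncos: %a vs %a\n", s, std::sin(y), c, std::cos(y));
  return 0;
}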
+ // Then sin(x) = sin((k * pi/128 + y) + // = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128) + DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); + DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); + // cos(x) = cos((k * pi/128 + y) + // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128) + DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); + DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); + + DoubleDouble sin_dd = + fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi); + DoubleDouble cos_dd = + fputil::exact_add(cos_k_cos_y.hi, msin_k_sin_y.hi); + sin_dd.lo += sin_k_cos_y.lo + cos_k_sin_y.lo; + cos_dd.lo += msin_k_sin_y.lo + cos_k_cos_y.lo; + +#ifdef LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS + *sin_x = sin_dd.hi + sin_dd.lo; + *cos_x = cos_dd.hi + cos_dd.lo; + return; +#else + // Accurate test and pass for correctly rounded implementation. + +#ifdef LIBC_TARGET_CPU_HAS_FMA + constexpr double ERR = 0x1.0p-70; +#else + // TODO: Improve non-FMA fast pass accuracy. + constexpr double ERR = 0x1.0p-66; +#endif // LIBC_TARGET_CPU_HAS_FMA + + double sin_lp = sin_dd.lo + ERR; + double sin_lm = sin_dd.lo - ERR; + double cos_lp = cos_dd.lo + ERR; + double cos_lm = cos_dd.lo - ERR; + + double sin_upper = sin_dd.hi + sin_lp; + double sin_lower = sin_dd.hi + sin_lm; + double cos_upper = cos_dd.hi + cos_lp; + double cos_lower = cos_dd.hi + cos_lm; + + // Ziv's rounding test. + if (LIBC_LIKELY(sin_upper == sin_lower && cos_upper == cos_lower)) { + *sin_x = sin_upper; + *cos_x = cos_upper; + return; + } + + Float128 u_f128, sin_u, cos_u; + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) + u_f128 = generic::range_reduction_small_f128(x); + else + u_f128 = range_reduction_large.accurate(); + + generic::sincos_eval(u_f128, sin_u, cos_u); + + auto get_sin_k = [](unsigned kk) -> Float128 { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + if (kk & 128) + ans.sign = Sign::NEG; + return ans; + }; + + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + Float128 sin_k_f128 = get_sin_k(k); + Float128 cos_k_f128 = get_sin_k(k + 64); + Float128 msin_k_f128 = get_sin_k(k + 128); + + // TODO: Add assertion if Ziv's accuracy tests fail in debug mode. + // https://github.com/llvm/llvm-project/issues/96452. + + if (sin_upper == sin_lower) + *sin_x = sin_upper; + else + // sin(x) = sin((k * pi/128 + u) + // = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128) + *sin_x = static_cast( + fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u), + fputil::quick_mul(cos_k_f128, sin_u))); + + if (cos_upper == cos_lower) + *cos_x = cos_upper; + else + // cos(x) = cos((k * pi/128 + u) + // = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128) + *cos_x = static_cast( + fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u), + fputil::quick_mul(msin_k_f128, sin_u))); + +#endif // !LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/sincos_eval.h b/libc/src/math/generic/sincos_eval.h new file mode 100644 index 00000000000000..5b4f3b8563cffc --- /dev/null +++ b/libc/src/math/generic/sincos_eval.h @@ -0,0 +1,124 @@ +//===-- Compute sin + cos for small angles ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_GENERIC_SINCOS_EVAL_H +#define LLVM_LIBC_SRC_MATH_GENERIC_SINCOS_EVAL_H + +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/integer_literals.h" + +namespace LIBC_NAMESPACE { + +namespace generic { + +using fputil::DoubleDouble; +using Float128 = fputil::DyadicFloat<128>; + +LIBC_INLINE void sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, + DoubleDouble &cos_u) { + // Evaluate sin(y) = sin(x - k * (pi/128)) + // We use the degree-7 Taylor approximation: + // sin(y) ~ y - y^3/3! + y^5/5! - y^7/7! + // Then the error is bounded by: + // |sin(y) - (y - y^3/3! + y^5/5! - y^7/7!)| < |y|^9/9! < 2^-54/9! < 2^-72. + // For y ~ u_hi + u_lo, fully expanding the polynomial and drop any terms + // < ulp(u_hi^3) gives us: + // y - y^3/3! + y^5/5! - y^7/7! = ... + // ~ u_hi + u_hi^3 * (-1/6 + u_hi^2 * (1/120 - u_hi^2 * 1/5040)) + + // + u_lo (1 + u_hi^2 * (-1/2 + u_hi^2 / 24)) + double u_hi_sq = u.hi * u.hi; // Error < ulp(u_hi^2) < 2^(-6 - 52) = 2^-58. + // p1 ~ 1/120 + u_hi^2 / 5040. + double p1 = fputil::multiply_add(u_hi_sq, -0x1.a01a01a01a01ap-13, + 0x1.1111111111111p-7); + // q1 ~ -1/2 + u_hi^2 / 24. + double q1 = fputil::multiply_add(u_hi_sq, 0x1.5555555555555p-5, -0x1.0p-1); + double u_hi_3 = u_hi_sq * u.hi; + // p2 ~ -1/6 + u_hi^2 (1/120 - u_hi^2 * 1/5040) + double p2 = fputil::multiply_add(u_hi_sq, p1, -0x1.5555555555555p-3); + // q2 ~ 1 + u_hi^2 (-1/2 + u_hi^2 / 24) + double q2 = fputil::multiply_add(u_hi_sq, q1, 1.0); + double sin_lo = fputil::multiply_add(u_hi_3, p2, u.lo * q2); + // Overall, |sin(y) - (u_hi + sin_lo)| < 2*ulp(u_hi^3) < 2^-69. + + // Evaluate cos(y) = cos(x - k * (pi/128)) + // We use the degree-8 Taylor approximation: + // cos(y) ~ 1 - y^2/2 + y^4/4! - y^6/6! + y^8/8! + // Then the error is bounded by: + // |cos(y) - (...)| < |y|^10/10! < 2^-81 + // For y ~ u_hi + u_lo, fully expanding the polynomial and drop any terms + // < ulp(u_hi^3) gives us: + // 1 - y^2/2 + y^4/4! - y^6/6! + y^8/8! = ... + // ~ 1 - u_hi^2/2 + u_hi^4(1/24 + u_hi^2 (-1/720 + u_hi^2/40320)) + + // + u_hi u_lo (-1 + u_hi^2/6) + // We compute 1 - u_hi^2 accurately: + // v_hi + v_lo ~ 1 - u_hi^2/2 + double v_hi = fputil::multiply_add(u.hi, u.hi * (-0.5), 1.0); + double v_lo = 1.0 - v_hi; // Exact + v_lo = fputil::multiply_add(u.hi, u.hi * (-0.5), v_lo); + + // r1 ~ -1/720 + u_hi^2 / 40320 + double r1 = fputil::multiply_add(u_hi_sq, 0x1.a01a01a01a01ap-16, + -0x1.6c16c16c16c17p-10); + // s1 ~ -1 + u_hi^2 / 6 + double s1 = fputil::multiply_add(u_hi_sq, 0x1.5555555555555p-3, -1.0); + double u_hi_4 = u_hi_sq * u_hi_sq; + double u_hi_u_lo = u.hi * u.lo; + // r2 ~ 1/24 + u_hi^2 (-1/720 + u_hi^2 / 40320) + double r2 = fputil::multiply_add(u_hi_sq, r1, 0x1.5555555555555p-5); + // s2 ~ v_lo + u_hi * u_lo * (-1 + u_hi^2 / 6) + double s2 = fputil::multiply_add(u_hi_u_lo, s1, v_lo); + double cos_lo = fputil::multiply_add(u_hi_4, r2, s2); + // Overall, |cos(y) - (v_hi + cos_lo)| < 2*ulp(u_hi^4) < 2^-75. + + sin_u = fputil::exact_add(u.hi, sin_lo); + cos_u = fputil::exact_add(v_hi, cos_lo); +} + +LIBC_INLINE void sincos_eval(const Float128 &u, Float128 &sin_u, + Float128 &cos_u) { + Float128 u_sq = fputil::quick_mul(u, u); + + // sin(u) ~ x - x^3/3! + x^5/5! - x^7/7! + x^9/9! 
- x^11/11! + x^13/13! + constexpr Float128 SIN_COEFFS[] = { + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1 + {Sign::NEG, -130, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, // -1/3! + {Sign::POS, -134, 0x88888888'88888888'88888888'88888889_u128}, // 1/5! + {Sign::NEG, -140, 0xd00d00d0'0d00d00d'00d00d00'd00d00d0_u128}, // -1/7! + {Sign::POS, -146, 0xb8ef1d2a'b6399c7d'560e4472'800b8ef2_u128}, // 1/9! + {Sign::NEG, -153, 0xd7322b3f'aa271c7f'3a3f25c1'bee38f10_u128}, // -1/11! + {Sign::POS, -160, 0xb092309d'43684be5'1c198e91'd7b4269e_u128}, // 1/13! + }; + + // cos(u) ~ 1 - x^2/2 + x^4/4! - x^6/6! + x^8/8! - x^10/10! + x^12/12! + constexpr Float128 COS_COEFFS[] = { + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 + {Sign::NEG, -128, 0x80000000'00000000'00000000'00000000_u128}, // 1/2 + {Sign::POS, -132, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, // 1/4! + {Sign::NEG, -137, 0xb60b60b6'0b60b60b'60b60b60'b60b60b6_u128}, // 1/6! + {Sign::POS, -143, 0xd00d00d0'0d00d00d'00d00d00'd00d00d0_u128}, // 1/8! + {Sign::NEG, -149, 0x93f27dbb'c4fae397'780b69f5'333c725b_u128}, // 1/10! + {Sign::POS, -156, 0x8f76c77f'c6c4bdaa'26d4c3d6'7f425f60_u128}, // 1/12! + }; + + sin_u = fputil::quick_mul(u, fputil::polyeval(u_sq, SIN_COEFFS[0], + SIN_COEFFS[1], SIN_COEFFS[2], + SIN_COEFFS[3], SIN_COEFFS[4], + SIN_COEFFS[5], SIN_COEFFS[6])); + cos_u = fputil::polyeval(u_sq, COS_COEFFS[0], COS_COEFFS[1], COS_COEFFS[2], + COS_COEFFS[3], COS_COEFFS[4], COS_COEFFS[5], + COS_COEFFS[6]); +} + +} // namespace generic + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF_EVAL_H diff --git a/libc/src/math/x86_64/CMakeLists.txt b/libc/src/math/x86_64/CMakeLists.txt index cd129e3eefb75c..3cfc422e56d49a 100644 --- a/libc/src/math/x86_64/CMakeLists.txt +++ b/libc/src/math/x86_64/CMakeLists.txt @@ -1,23 +1,3 @@ -add_entrypoint_object( - cos - SRCS - cos.cpp - HDRS - ../cos.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - sin - SRCS - sin.cpp - HDRS - ../sin.h - COMPILE_OPTIONS - -O2 -) - add_entrypoint_object( tan SRCS diff --git a/libc/src/signal/linux/CMakeLists.txt b/libc/src/signal/linux/CMakeLists.txt index 7606b4b21d3ddb..5a6be242ddd94f 100644 --- a/libc/src/signal/linux/CMakeLists.txt +++ b/libc/src/signal/linux/CMakeLists.txt @@ -60,6 +60,7 @@ add_entrypoint_object( DEPENDS .__restore libc.hdr.types.sigset_t + libc.hdr.types.struct_sigaction libc.include.sys_syscall libc.src.__support.OSUtil.osutil libc.src.errno.errno @@ -124,7 +125,7 @@ add_entrypoint_object( ../signal.h DEPENDS .sigaction - libc.include.signal + libc.hdr.signal_macros ) add_entrypoint_object( diff --git a/libc/src/signal/linux/signal.cpp b/libc/src/signal/linux/signal.cpp index a517fa73ba8a2c..0d5463553eb78d 100644 --- a/libc/src/signal/linux/signal.cpp +++ b/libc/src/signal/linux/signal.cpp @@ -7,11 +7,9 @@ //===----------------------------------------------------------------------===// #include "src/signal/signal.h" -#include "src/signal/sigaction.h" - +#include "hdr/signal_macros.h" #include "src/__support/common.h" - -#include +#include "src/signal/sigaction.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/signal/sigaction.h b/libc/src/signal/sigaction.h index c36a3ec9fedfaa..15b689b7dacc75 100644 --- a/libc/src/signal/sigaction.h +++ b/libc/src/signal/sigaction.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_SIGNAL_SIGACTION_H #define LLVM_LIBC_SRC_SIGNAL_SIGACTION_H -#include +#include "hdr/types/struct_sigaction.h" namespace LIBC_NAMESPACE { diff --git 
a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 61c05f07389590..677bf358c82c4a 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -304,6 +304,7 @@ add_entrypoint_object( DEPENDS .rand_util libc.include.stdlib + libc.src.__support.threads.sleep ) add_entrypoint_object( diff --git a/libc/src/stdlib/rand.cpp b/libc/src/stdlib/rand.cpp index ad543b4048a949..ff3875c2f69592 100644 --- a/libc/src/stdlib/rand.cpp +++ b/libc/src/stdlib/rand.cpp @@ -8,6 +8,7 @@ #include "src/stdlib/rand.h" #include "src/__support/common.h" +#include "src/__support/threads/sleep.h" #include "src/stdlib/rand_util.h" namespace LIBC_NAMESPACE { @@ -15,12 +16,17 @@ namespace LIBC_NAMESPACE { // An implementation of the xorshift64star pseudo random number generator. This // is a good general purpose generator for most non-cryptographics applications. LLVM_LIBC_FUNCTION(int, rand, (void)) { - unsigned long x = rand_next; - x ^= x >> 12; - x ^= x << 25; - x ^= x >> 27; - rand_next = x; - return static_cast((x * 0x2545F4914F6CDD1Dul) >> 32) & RAND_MAX; + unsigned long orig = rand_next.load(cpp::MemoryOrder::RELAXED); + for (;;) { + unsigned long x = orig; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + if (rand_next.compare_exchange_strong(orig, x, cpp::MemoryOrder::ACQUIRE, + cpp::MemoryOrder::RELAXED)) + return static_cast((x * 0x2545F4914F6CDD1Dul) >> 32) & RAND_MAX; + sleep_briefly(); + } } } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdlib/rand_util.cpp b/libc/src/stdlib/rand_util.cpp index 1f3dbce9c78608..ff3478db700035 100644 --- a/libc/src/stdlib/rand_util.cpp +++ b/libc/src/stdlib/rand_util.cpp @@ -7,18 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/rand_util.h" +#include "src/__support/CPP/atomic.h" #include "src/__support/macros/attributes.h" namespace LIBC_NAMESPACE { -#ifdef LIBC_TARGET_ARCH_IS_GPU -// FIXME: Local GPU memory cannot be initialized so we cannot currently provide -// a standard compliant default value. -ThreadLocal rand_next; -#else -// C standard 7.10p2: If 'rand' is called before 'srand' it is to proceed as if -// the 'srand' function was called with a value of '1'. -LIBC_THREAD_LOCAL unsigned long rand_next = 1; -#endif +// C standard 7.10p2: If 'rand' is called before 'srand' it is to +// proceed as if the 'srand' function was called with a value of '1'. +cpp::Atomic rand_next = 1; } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdlib/rand_util.h b/libc/src/stdlib/rand_util.h index cadd6b5cdcbb8c..5d7febf8248d88 100644 --- a/libc/src/stdlib/rand_util.h +++ b/libc/src/stdlib/rand_util.h @@ -9,33 +9,15 @@ #ifndef LLVM_LIBC_SRC_STDLIB_RAND_UTIL_H #define LLVM_LIBC_SRC_STDLIB_RAND_UTIL_H -#include "src/__support/GPU/utils.h" +#include "src/__support/CPP/atomic.h" #include "src/__support/macros/attributes.h" namespace LIBC_NAMESPACE { -#ifdef LIBC_TARGET_ARCH_IS_GPU -// Implement thread local storage on the GPU using local memory. Each thread -// gets its slot in the local memory array and is private to the group. -// TODO: We need to implement the 'thread_local' keyword on the GPU. This is an -// inefficient and incomplete stand-in until that is done. 
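The rand() rewrite above makes the xorshift64* update lock-free: each call recomputes the next state from a snapshot and publishes it with a compare-exchange, retrying (with a sleep_briefly() pause hint) if another thread advanced the state first. A standalone approximation of the same loop with std::atomic (rng_state and xorshift_rand are illustrative names; the in-tree code uses cpp::Atomic, compare_exchange_strong and the libc RAND_MAX):

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

std::atomic<std::uint64_t> rng_state{1}; // behave as if srand(1) had been called

int xorshift_rand() {
  std::uint64_t orig = rng_state.load(std::memory_order_relaxed);
  for (;;) {
    std::uint64_t x = orig;
    x ^= x >> 12;
    x ^= x << 25;
    x ^= x >> 27;
    // On failure, compare_exchange_weak reloads `orig` with the current state,
    // so the next iteration starts from that newer snapshot.
    if (rng_state.compare_exchange_weak(orig, x, std::memory_order_acquire,
                                        std::memory_order_relaxed))
      return static_cast<int>(((x * 0x2545F4914F6CDD1Dull) >> 32) & RAND_MAX);
  }
}

int main() {
  std::printf("%d %d %d\n", xorshift_rand(), xorshift_rand(), xorshift_rand());
  return 0;
}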
-template class ThreadLocal { -private: - static constexpr long MAX_THREADS = 1024; - [[clang::loader_uninitialized]] static inline gpu::Local - storage[MAX_THREADS]; - -public: - LIBC_INLINE operator T() const { return storage[gpu::get_thread_id()]; } - LIBC_INLINE void operator=(const T &value) { - storage[gpu::get_thread_id()] = value; - } -}; - -extern ThreadLocal rand_next; -#else -extern LIBC_THREAD_LOCAL unsigned long rand_next; -#endif +// The ISO C standard does not explicitly require thread-safe behavior for the +// generic `rand()` function. Some implementations expect it however, so we +// provide it here. +extern cpp::Atomic rand_next; } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdlib/srand.cpp b/libc/src/stdlib/srand.cpp index 008c7a9e565e41..21166c7a6754e9 100644 --- a/libc/src/stdlib/srand.cpp +++ b/libc/src/stdlib/srand.cpp @@ -12,6 +12,8 @@ namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(void, srand, (unsigned int seed)) { rand_next = seed; } +LLVM_LIBC_FUNCTION(void, srand, (unsigned int seed)) { + rand_next.store(seed, cpp::MemoryOrder::RELAXED); +} } // namespace LIBC_NAMESPACE diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt index f041a4826dc947..336c5d0f6bfa27 100644 --- a/libc/startup/linux/CMakeLists.txt +++ b/libc/startup/linux/CMakeLists.txt @@ -98,6 +98,7 @@ add_object_library( libc.config.linux.app_h libc.include.sys_mman libc.include.sys_syscall + libc.include.llvm-libc-macros.link_macros libc.src.__support.threads.thread libc.src.__support.OSUtil.osutil libc.src.stdlib.exit diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp index 55fd575f7ad0b0..30ab1f0e26ea58 100644 --- a/libc/startup/linux/do_start.cpp +++ b/libc/startup/linux/do_start.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// #include "startup/linux/do_start.h" +#include "include/llvm-libc-macros/link-macros.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/threads/thread.h" #include "src/stdlib/atexit.h" @@ -68,8 +69,8 @@ static ThreadAttributes main_thread_attrib; // After the argv array, is a 8-byte long NULL value before the array of env // values. The end of the env values is marked by another 8-byte long NULL // value. We step over it (the "+ 1" below) to get to the env values. - ArgVEntryType *env_ptr = app.args->argv + app.args->argc + 1; - ArgVEntryType *env_end_marker = env_ptr; + uintptr_t *env_ptr = app.args->argv + app.args->argc + 1; + uintptr_t *env_end_marker = env_ptr; app.env_ptr = env_ptr; while (*env_end_marker) ++env_end_marker; @@ -79,13 +80,13 @@ static ThreadAttributes main_thread_attrib; // After the env array, is the aux-vector. The end of the aux-vector is // denoted by an AT_NULL entry. 
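The comments above describe the System V layout of the initial stack that do_start walks: argc, then argv[] terminated by a null pointer, the environment block terminated by another null, and finally {id, value} auxiliary-vector entries ending with AT_NULL. A compact sketch of that walk (AuxEntry, find_aux_value and the fabricated stack in main are illustrative; the real startup code stores these pointers in its app structure and uses the ElfW types touched by this patch):

#include <cstdint>
#include <cstdio>

// Illustrative walk of the initial-stack layout described above:
//   [argc][argv[0..argc-1]][NULL][envp...][NULL][{id, value} aux entries][AT_NULL]
struct AuxEntry {
  std::uintptr_t id;
  std::uintptr_t value;
};

std::uintptr_t find_aux_value(std::uintptr_t *sp, std::uintptr_t wanted_id) {
  std::uintptr_t argc = sp[0];
  std::uintptr_t *envp = sp + 1 + argc + 1; // step over argc, argv[] and its NULL
  std::uintptr_t *env_end = envp;
  while (*env_end) // the environment block ends at a null pointer
    ++env_end;
  for (auto *aux = reinterpret_cast<AuxEntry *>(env_end + 1);
       aux->id != 0 /* AT_NULL */; ++aux)
    if (aux->id == wanted_id)
      return aux->value;
  return 0;
}

int main() {
  const char *arg0 = "prog", *env0 = "PATH=/bin";
  std::uintptr_t fake_stack[] = {
      1,                                         // argc
      reinterpret_cast<std::uintptr_t>(arg0), 0, // argv[0], NULL terminator
      reinterpret_cast<std::uintptr_t>(env0), 0, // envp[0], NULL terminator
      3 /* AT_PHDR */, 0x1000, 0, 0};            // one aux entry, then AT_NULL
  std::printf("AT_PHDR value: %#lx\n",
              static_cast<unsigned long>(find_aux_value(fake_stack, 3)));
  return 0;
}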
- Elf64_Phdr *program_hdr_table = nullptr; + ElfW(Phdr) *program_hdr_table = nullptr; uintptr_t program_hdr_count = 0; app.auxv_ptr = reinterpret_cast(env_end_marker + 1); for (auto *aux_entry = app.auxv_ptr; aux_entry->id != AT_NULL; ++aux_entry) { switch (aux_entry->id) { case AT_PHDR: - program_hdr_table = reinterpret_cast(aux_entry->value); + program_hdr_table = reinterpret_cast(aux_entry->value); break; case AT_PHNUM: program_hdr_count = aux_entry->value; @@ -100,10 +101,10 @@ static ThreadAttributes main_thread_attrib; ptrdiff_t base = 0; app.tls.size = 0; - Elf64_Phdr *tls_phdr = nullptr; + ElfW(Phdr) *tls_phdr = nullptr; for (uintptr_t i = 0; i < program_hdr_count; ++i) { - Elf64_Phdr &phdr = program_hdr_table[i]; + ElfW(Phdr) &phdr = program_hdr_table[i]; if (phdr.p_type == PT_PHDR) base = reinterpret_cast(program_hdr_table) - phdr.p_vaddr; if (phdr.p_type == PT_DYNAMIC && _DYNAMIC) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index e224e82c922f03..ce8413fed71721 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -13,17 +13,19 @@ add_libc_test( libc.src.string.memcpy ) -add_libc_test( - freelist_test - SUITE - libc-support-tests - SRCS - freelist_test.cpp - DEPENDS - libc.src.__support.CPP.array - libc.src.__support.CPP.span - libc.src.__support.freelist -) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + add_libc_test( + freelist_test + SUITE + libc-support-tests + SRCS + freelist_test.cpp + DEPENDS + libc.src.__support.CPP.array + libc.src.__support.CPP.span + libc.src.__support.freelist + ) +endif() if(LLVM_LIBC_FULL_BUILD) add_libc_test( diff --git a/libc/test/src/__support/fixedvector_test.cpp b/libc/test/src/__support/fixedvector_test.cpp index 212e1aed20f7cd..b73df04d03746b 100644 --- a/libc/test/src/__support/fixedvector_test.cpp +++ b/libc/test/src/__support/fixedvector_test.cpp @@ -96,3 +96,13 @@ TEST(LlvmLibcFixedVectorTest, ForwardIteration) { ASSERT_EQ(*it, arr[idx]); } } + +TEST(LlvmLibcFixedVectorTest, ConstForwardIteration) { + const LIBC_NAMESPACE::cpp::array arr{1, 2, 3, 4}; + const LIBC_NAMESPACE::FixedVector vec(arr.begin(), arr.end()); + ASSERT_EQ(vec.size(), arr.size()); + for (auto it = vec.begin(); it != vec.end(); ++it) { + auto idx = it - vec.begin(); + ASSERT_EQ(*it, arr[idx]); + } +} diff --git a/libc/test/src/__support/math_extras_test.cpp b/libc/test/src/__support/math_extras_test.cpp index 0047888965177e..9f9036b9d5cdfa 100644 --- a/libc/test/src/__support/math_extras_test.cpp +++ b/libc/test/src/__support/math_extras_test.cpp @@ -102,7 +102,7 @@ TYPED_TEST(LlvmLibcBitTest, CountZeros, UnsignedTypesNoBigInt) { } using UnsignedTypes = testing::TypeList< -#if defined(__SIZEOF_INT128__) +#if defined(LIBC_TYPES_HAS_INT128) __uint128_t, #endif unsigned char, unsigned short, unsigned int, unsigned long, diff --git a/libc/test/src/fcntl/CMakeLists.txt b/libc/test/src/fcntl/CMakeLists.txt index aae296f074bea0..48048b7fe88666 100644 --- a/libc/test/src/fcntl/CMakeLists.txt +++ b/libc/test/src/fcntl/CMakeLists.txt @@ -29,6 +29,7 @@ add_libc_unittest( libc.src.fcntl.fcntl libc.src.fcntl.open libc.src.unistd.close + libc.src.unistd.getpid libc.hdr.types.struct_flock libc.hdr.fcntl_macros libc.test.UnitTest.ErrnoSetterMatcher diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index fc909acc1aa0be..ffbb3ec337ed4d 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -12,6 +12,7 @@ #include 
"src/fcntl/fcntl.h" #include "src/fcntl/open.h" #include "src/unistd/close.h" +#include "src/unistd/getpid.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" @@ -163,3 +164,22 @@ TEST(LlvmLibcFcntlTest, UseAfterClose) { ASSERT_EQ(-1, LIBC_NAMESPACE::fcntl(fd, F_GETFL)); ASSERT_ERRNO_EQ(EBADF); } + +TEST(LlvmLibcFcntlTest, SetGetOwnerTest) { + LIBC_NAMESPACE::libc_errno = 0; + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + pid_t pid = LIBC_NAMESPACE::getpid(); + ASSERT_GT(pid, -1); + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_set_get_owner.test"; + auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + int ret = LIBC_NAMESPACE::fcntl(fd, F_SETOWN, pid); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(ret, -1); + int ret2 = LIBC_NAMESPACE::fcntl(fd, F_GETOWN); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(ret2, pid); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); +} diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index bb364c3f0a1751..2e04abd7ce062a 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -72,6 +72,18 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + sincos_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + sincos_test.cpp + DEPENDS + libc.src.math.sincos + libc.src.__support.FPUtil.fp_bits +) + add_fp_unittest( tanf_test NEED_MPFR @@ -1247,9 +1259,10 @@ add_fp_unittest( libc-math-unittests SRCS sqrtf_test.cpp + HDRS + SqrtTest.h DEPENDS libc.src.math.sqrtf - libc.src.__support.FPUtil.fp_bits ) add_fp_unittest( @@ -1259,9 +1272,10 @@ add_fp_unittest( libc-math-unittests SRCS sqrt_test.cpp + HDRS + SqrtTest.h DEPENDS libc.src.math.sqrt - libc.src.__support.FPUtil.fp_bits ) add_fp_unittest( @@ -1271,9 +1285,10 @@ add_fp_unittest( libc-math-unittests SRCS sqrtl_test.cpp + HDRS + SqrtTest.h DEPENDS libc.src.math.sqrtl - libc.src.__support.FPUtil.fp_bits ) add_fp_unittest( @@ -1283,9 +1298,10 @@ add_fp_unittest( libc-math-unittests SRCS generic_sqrtf_test.cpp + HDRS + SqrtTest.h DEPENDS libc.src.math.sqrtf - libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS -O3 @@ -1298,9 +1314,10 @@ add_fp_unittest( libc-math-unittests SRCS generic_sqrt_test.cpp + HDRS + SqrtTest.h DEPENDS libc.src.math.sqrt - libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS -O3 @@ -1313,9 +1330,10 @@ add_fp_unittest( libc-math-unittests SRCS generic_sqrtl_test.cpp + HDRS + SqrtTest.h DEPENDS libc.src.math.sqrtl - libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS -O3 @@ -1729,6 +1747,18 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + fmul_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + fmul_test.cpp + HDRS + FMulTest.h + DEPENDS + libc.src.math.fmul +) add_fp_unittest( asinhf_test NEED_MPFR @@ -1890,6 +1920,60 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + f16div_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16div_test.cpp + HDRS + DivTest.h + DEPENDS + libc.src.math.f16div +) + +add_fp_unittest( + f16divf_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16divf_test.cpp + HDRS + DivTest.h + DEPENDS + libc.src.math.f16divf +) + +add_fp_unittest( + f16divl_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16divl_test.cpp + HDRS + 
DivTest.h + DEPENDS + libc.src.math.f16divl +) + +add_fp_unittest( + f16fma_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16fma_test.cpp + HDRS + FmaTest.h + DEPENDS + libc.src.math.f16fma + libc.src.stdlib.rand + libc.src.stdlib.srand +) + add_fp_unittest( f16fmaf_test NEED_MPFR @@ -1905,6 +1989,47 @@ add_fp_unittest( libc.src.stdlib.srand ) +add_fp_unittest( + f16fmal_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16fmal_test.cpp + HDRS + FmaTest.h + DEPENDS + libc.src.math.f16fmal + libc.src.stdlib.rand + libc.src.stdlib.srand +) + +add_fp_unittest( + f16sqrt_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16sqrt_test.cpp + HDRS + SqrtTest.h + DEPENDS + libc.src.math.f16sqrt +) + +add_fp_unittest( + f16sqrtl_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16sqrtl_test.cpp + HDRS + SqrtTest.h + DEPENDS + libc.src.math.f16sqrtl +) + add_subdirectory(generic) add_subdirectory(smoke) diff --git a/libc/test/src/math/DivTest.h b/libc/test/src/math/DivTest.h new file mode 100644 index 00000000000000..1cdc1398a1a1c1 --- /dev/null +++ b/libc/test/src/math/DivTest.h @@ -0,0 +1,74 @@ +//===-- Utility class to test different flavors of float div ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TEST_SRC_MATH_DIVTEST_H +#define LLVM_LIBC_TEST_SRC_MATH_DIVTEST_H + +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +template +class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { + + struct InConstants { + DECLARE_SPECIAL_CONSTANTS(InType) + }; + + using InFPBits = typename InConstants::FPBits; + using InStorageType = typename InConstants::StorageType; + + static constexpr InStorageType IN_MAX_NORMAL_U = + InFPBits::max_normal().uintval(); + static constexpr InStorageType IN_MIN_NORMAL_U = + InFPBits::min_normal().uintval(); + static constexpr InStorageType IN_MAX_SUBNORMAL_U = + InFPBits::max_subnormal().uintval(); + static constexpr InStorageType IN_MIN_SUBNORMAL_U = + InFPBits::min_subnormal().uintval(); + +public: + using DivFunc = OutType (*)(InType, InType); + + void test_subnormal_range(DivFunc func) { + constexpr InStorageType COUNT = 100'001; + constexpr InStorageType STEP = + (IN_MAX_SUBNORMAL_U - IN_MIN_SUBNORMAL_U) / COUNT; + for (InStorageType i = 0, v = 0, w = IN_MAX_SUBNORMAL_U; i <= COUNT; + ++i, v += STEP, w -= STEP) { + InType x = InFPBits(v).get_val(); + InType y = InFPBits(w).get_val(); + mpfr::BinaryInput input{x, y}; + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), + 0.5); + } + } + + void test_normal_range(DivFunc func) { + constexpr InStorageType COUNT = 100'001; + constexpr InStorageType STEP = (IN_MAX_NORMAL_U - IN_MIN_NORMAL_U) / COUNT; + for (InStorageType i = 0, v = 0, w = IN_MAX_NORMAL_U; i <= COUNT; + ++i, v += STEP, w -= STEP) { + InType x = InFPBits(v).get_val(); + InType y = InFPBits(w).get_val(); + mpfr::BinaryInput input{x, y}; + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), + 0.5); + } + } +}; + +#define LIST_DIV_TESTS(OutType, InType, func) \ + using LlvmLibcDivTest = DivTest; \ + TEST_F(LlvmLibcDivTest, SubnormalRange) { 
test_subnormal_range(&func); } \ + TEST_F(LlvmLibcDivTest, NormalRange) { test_normal_range(&func); } + +#endif // LLVM_LIBC_TEST_SRC_MATH_DIVTEST_H diff --git a/libc/test/src/math/FMulTest.h b/libc/test/src/math/FMulTest.h new file mode 100644 index 00000000000000..864910c29d83f4 --- /dev/null +++ b/libc/test/src/math/FMulTest.h @@ -0,0 +1,121 @@ +//===-- Utility class to test fmul[f|l] ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H +#define LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H + +#include "src/__support/FPUtil/FPBits.h" +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +template +class FmulMPFRTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { + + DECLARE_SPECIAL_CONSTANTS(InType) + +public: + typedef OutType (*FMulFunc)(InType, InType); + + void testFMulMPFR(FMulFunc func) { + constexpr int N = 10; + mpfr::BinaryInput INPUTS[N] = { + {3.0, 5.0}, + {0x1.0p1, 0x1.0p-131}, + {0x1.0p2, 0x1.0p-129}, + {1.0, 1.0}, + {-0.0, -0.0}, + {-0.0, 0.0}, + {0.0, -0.0}, + {0x1.0p100, 0x1.0p100}, + {1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150}, + {1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150}}; + + for (int i = 0; i < N; ++i) { + InType x = INPUTS[i].x; + InType y = INPUTS[i].y; + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, INPUTS[i], + func(x, y), 0.5); + } + } + + void testSpecialInputsMPFR(FMulFunc func) { + constexpr int N = 27; + mpfr::BinaryInput INPUTS[N] = {{inf, 0x1.0p-129}, + {0x1.0p-129, inf}, + {inf, 2.0}, + {3.0, inf}, + {0.0, 0.0}, + {neg_inf, aNaN}, + {aNaN, neg_inf}, + {neg_inf, neg_inf}, + {0.0, neg_inf}, + {neg_inf, 0.0}, + {neg_inf, 1.0}, + {1.0, neg_inf}, + {neg_inf, 0x1.0p-129}, + {0x1.0p-129, neg_inf}, + {0.0, 0x1.0p-129}, + {inf, 0.0}, + {0.0, inf}, + {0.0, aNaN}, + {2.0, aNaN}, + {0x1.0p-129, aNaN}, + {inf, aNaN}, + {aNaN, aNaN}, + {0.0, sNaN}, + {2.0, sNaN}, + {0x1.0p-129, sNaN}, + {inf, sNaN}, + {sNaN, sNaN}}; + + for (int i = 0; i < N; ++i) { + InType x = INPUTS[i].x; + InType y = INPUTS[i].y; + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, INPUTS[i], + func(x, y), 0.5); + } + } + + void testNormalRange(FMulFunc func) { + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + using StorageType = typename FPBits::StorageType; + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + + constexpr StorageType COUNT = 10'001; + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; + for (int signs = 0; signs < 4; ++signs) { + for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; + v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { + InType x = FPBits(v).get_val(), y = FPBits(w).get_val(); + if (signs % 2 == 1) { + x = -x; + } + if (signs >= 2) { + y = -y; + } + + mpfr::BinaryInput input{x, y}; + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, input, func(x, y), + 0.5); + } + } + } +}; + +#define LIST_FMUL_MPFR_TESTS(OutType, InType, func) \ + using LlvmLibcFmulTest = FmulMPFRTest; \ + TEST_F(LlvmLibcFmulTest, MulMpfr) { testFMulMPFR(&func); } \ + TEST_F(LlvmLibcFmulTest, 
NanInfMpfr) { testSpecialInputsMPFR(&func); } \ + TEST_F(LlvmLibcFmulTest, NormalRange) { testNormalRange(&func); } + +#endif // LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h index 53895e7d633c29..01143331d4ab57 100644 --- a/libc/test/src/math/FmaTest.h +++ b/libc/test/src/math/FmaTest.h @@ -45,9 +45,6 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { static constexpr InStorageType IN_MIN_SUBNORMAL_U = InFPBits::min_subnormal().uintval(); - OutConstants out; - InConstants in; - InStorageType get_random_bit_pattern() { InStorageType bits{0}; for (InStorageType i = 0; i < sizeof(InStorageType) / 2; ++i) { @@ -92,4 +89,14 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { } }; +#define LIST_FMA_TESTS(T, func) \ + using LlvmLibcFmaTest = FmaTestTemplate; \ + TEST_F(LlvmLibcFmaTest, SubnormalRange) { test_subnormal_range(&func); } \ + TEST_F(LlvmLibcFmaTest, NormalRange) { test_normal_range(&func); } + +#define LIST_NARROWING_FMA_TESTS(OutType, InType, func) \ + using LlvmLibcFmaTest = FmaTestTemplate; \ + TEST_F(LlvmLibcFmaTest, SubnormalRange) { test_subnormal_range(&func); } \ + TEST_F(LlvmLibcFmaTest, NormalRange) { test_normal_range(&func); } + #endif // LLVM_LIBC_TEST_SRC_MATH_FMATEST_H diff --git a/libc/test/src/math/SqrtTest.h b/libc/test/src/math/SqrtTest.h index 1c422e201bb234..770cc94b3b9406 100644 --- a/libc/test/src/math/SqrtTest.h +++ b/libc/test/src/math/SqrtTest.h @@ -6,51 +6,36 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/CPP/bit.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/math_macros.h" - namespace mpfr = LIBC_NAMESPACE::testing::mpfr; -template +template class SqrtTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { - DECLARE_SPECIAL_CONSTANTS(T) + DECLARE_SPECIAL_CONSTANTS(InType) static constexpr StorageType HIDDEN_BIT = - StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; + StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; public: - typedef T (*SqrtFunc)(T); - - void test_special_numbers(SqrtFunc func) { - ASSERT_FP_EQ(aNaN, func(aNaN)); - ASSERT_FP_EQ(inf, func(inf)); - ASSERT_FP_EQ(aNaN, func(neg_inf)); - ASSERT_FP_EQ(0.0, func(0.0)); - ASSERT_FP_EQ(-0.0, func(-0.0)); - ASSERT_FP_EQ(aNaN, func(T(-1.0))); - ASSERT_FP_EQ(T(1.0), func(T(1.0))); - ASSERT_FP_EQ(T(2.0), func(T(4.0))); - ASSERT_FP_EQ(T(3.0), func(T(9.0))); - } + using SqrtFunc = OutType (*)(InType); void test_denormal_values(SqrtFunc func) { for (StorageType mant = 1; mant < HIDDEN_BIT; mant <<= 1) { - FPBits denormal(T(0.0)); + FPBits denormal(zero); denormal.set_mantissa(mant); - T x = denormal.get_val(); + InType x = denormal.get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sqrt, x, func(x), 0.5); } constexpr StorageType COUNT = 200'001; constexpr StorageType STEP = HIDDEN_BIT / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = LIBC_NAMESPACE::cpp::bit_cast(v); + InType x = FPBits(i).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sqrt, x, func(x), 0.5); } } @@ -59,17 +44,21 @@ class SqrtTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { constexpr StorageType COUNT = 200'001; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = LIBC_NAMESPACE::cpp::bit_cast(v); - if 
(isnan(x) || (x < 0)) { + FPBits x_bits(v); + InType x = x_bits.get_val(); + if (x_bits.is_nan() || (x < 0)) continue; - } EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sqrt, x, func(x), 0.5); } } }; #define LIST_SQRT_TESTS(T, func) \ - using LlvmLibcSqrtTest = SqrtTest; \ - TEST_F(LlvmLibcSqrtTest, SpecialNumbers) { test_special_numbers(&func); } \ + using LlvmLibcSqrtTest = SqrtTest; \ + TEST_F(LlvmLibcSqrtTest, DenormalValues) { test_denormal_values(&func); } \ + TEST_F(LlvmLibcSqrtTest, NormalRange) { test_normal_range(&func); } + +#define LIST_NARROWING_SQRT_TESTS(OutType, InType, func) \ + using LlvmLibcSqrtTest = SqrtTest; \ TEST_F(LlvmLibcSqrtTest, DenormalValues) { test_denormal_values(&func); } \ TEST_F(LlvmLibcSqrtTest, NormalRange) { test_normal_range(&func); } diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp index 9a39616ed16f81..e12e9a8274692d 100644 --- a/libc/test/src/math/cos_test.cpp +++ b/libc/test/src/math/cos_test.cpp @@ -6,27 +6,124 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/FPUtil/FPBits.h" #include "src/math/cos.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/math_macros.h" - using LlvmLibcCosTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; -TEST_F(LlvmLibcCosTest, Range) { - static constexpr double _2pi = 6.283185307179586; - constexpr StorageType COUNT = 100'000; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - double x = FPBits(v).get_val(); - // TODO: Expand the range of testing after range reduction is implemented. - if (isnan(x) || isinf(x) || x > _2pi || x < -_2pi) - continue; - - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, LIBC_NAMESPACE::cos(x), 1.0); +using LIBC_NAMESPACE::testing::tlog; + +TEST_F(LlvmLibcCosTest, TrickyInputs) { + constexpr double INPUTS[] = { + 0x1.8000000000009p-23, 0x1.8000000000024p-22, + 0x1.800000000009p-21, 0x1.20000000000f3p-20, + 0x1.800000000024p-20, 0x1.e0000000001c2p-20, + 0x1.940c877fb7dacp-7, -0x1.f42fb19b5b9b2p-6, + -0x1.0285070f9f1bcp-5, 0x1.23f40dccdef72p+0, + 0x1.43cf16358c9d7p+0, 0x1.addf3b9722265p+0, + 0x1.ae78d360afa15p+0, 0x1.e31b55306f22cp+2, + 0x1.e639103a05997p+2, -0x1.f7898d5a756ddp+2, + 0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3, + -0x1.aaf85537ea4c7p+3, 0x1.4f2b874135d27p+4, + 0x1.13114266f9764p+4, 0x1.a211877de55dbp+4, + -0x1.a5eece87e8606p+4, 0x1.a65d441ea6dcep+4, + 0x1.1ffb509f3db15p+5, 0x1.2345d1e090529p+5, + 0x1.c96e28eb679f8p+5, 0x1.be886d9c2324dp+6, + -0x1.ab514bfc61c76p+7, -0x1.14823229799c2p+7, + 0x1.48ff1782ca91dp+8, 0x1.dcbfda0c7559ep+8, + 0x1.dcbfda0c7559ep+8, 0x1.2e566149bf5fdp+9, + 0x1.cb996c60f437ep+9, 0x1.ae945054939c2p+10, + -0x1.119471e9216cdp+10, 0x1.ae945054939c2p+10, + 0x1.fffffffffdb6p+24, 0x1.fd4da4ef37075p+29, + 0x1.55202aefde314p+31, 0x1.b951f1572eba5p+31, + 0x1.7776c2343ba4ep+101, 0x1.85fc0f04c0128p+101, + 0x1.678309fa50d58p+110, 0x1.fffffffffef4ep+199, + 0x1.3eec5912ea7cdp+331, -0x1.3eec5912ea7cdp+331, + -0x1.6deb37da81129p+205, 0x1.08087e9aad90bp+887, + -0x1.08087e9aad90bp+887, -0x1.8bb5847d49973p+845, + 0x1.6ac5b262ca1ffp+849, 0x1.f08b14e1c4d0fp+890, + 0x1.2b5fe88a9d8d5p+903, 0x1.f6d7518808571p+1023, + -0x1.a880417b7b119p+1023, 0x1.00a33764a0a83p-7, + 0x1.fe81868fc47fep+1, 0x1.0da8cc189b47dp-10, + 0x1.da1838053b866p+5, + + }; + constexpr int N = sizeof(INPUTS) / sizeof(INPUTS[0]); + + for (int i = 0; 
i < N; ++i) { + double x = INPUTS[i]; + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, + LIBC_NAMESPACE::cos(x), 0.5); } } + +TEST_F(LlvmLibcCosTest, InDoubleRange) { + constexpr uint64_t COUNT = 1'234'51; + uint64_t START = LIBC_NAMESPACE::fputil::FPBits(0x1.0p-50).uintval(); + uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits(0x1.0p200).uintval(); + uint64_t STEP = (STOP - START) / COUNT; + + auto test = [&](mpfr::RoundingMode rounding_mode) { + mpfr::ForceRoundingMode __r(rounding_mode); + if (!__r.success) + return; + + uint64_t fails = 0; + uint64_t tested = 0; + uint64_t total = 0; + double worst_input, worst_output = 0.0; + double ulp = 0.5; + + for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) { + double x = FPBits(v).get_val(); + if (isnan(x) || isinf(x)) + continue; + + double result = LIBC_NAMESPACE::cos(x); + ++total; + if (isnan(result) || isinf(result)) + continue; + + ++tested; + + if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cos, x, result, + 0.5, rounding_mode)) { + ++fails; + while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cos, x, + result, ulp, rounding_mode)) { + worst_input = x; + worst_output = result; + + if (ulp > 1000.0) + break; + + ulp *= 2.0; + } + } + } + if (fails) { + tlog << " Cos failed: " << fails << "/" << tested << "/" << total + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(ulp) << ".\n"; + EXPECT_MPFR_MATCH(mpfr::Operation::Cos, worst_input, worst_output, 0.5, + rounding_mode); + } + }; + + tlog << " Test Rounding To Nearest...\n"; + test(mpfr::RoundingMode::Nearest); + + tlog << " Test Rounding Downward...\n"; + test(mpfr::RoundingMode::Downward); + + tlog << " Test Rounding Upward...\n"; + test(mpfr::RoundingMode::Upward); + + tlog << " Test Rounding Toward Zero...\n"; + test(mpfr::RoundingMode::TowardZero); +} diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt index 34df8720ed4dbe..fb3596c3378ff4 100644 --- a/libc/test/src/math/exhaustive/CMakeLists.txt +++ b/libc/test/src/math/exhaustive/CMakeLists.txt @@ -4,6 +4,10 @@ add_header_library( exhaustive_test HDRS exhaustive_test.h + DEPENDS + libc.src.__support.CPP.type_traits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -277,6 +281,21 @@ add_fp_unittest( libc.src.__support.FPUtil.generic.fmod ) +add_fp_unittest( + fmodf16_test + NO_RUN_POSTBUILD + NEED_MPFR + SUITE + libc_math_exhaustive_tests + SRCS + fmodf16_test.cpp + DEPENDS + .exhaustive_test + libc.src.math.fmodf16 + LINK_LIBRARIES + -lpthread +) + add_fp_unittest( coshf_test NO_RUN_POSTBUILD diff --git a/libc/test/src/math/exhaustive/exhaustive_test.h b/libc/test/src/math/exhaustive/exhaustive_test.h index 13e272783250b8..6f0c78ebefa470 100644 --- a/libc/test/src/math/exhaustive/exhaustive_test.h +++ b/libc/test/src/math/exhaustive/exhaustive_test.h @@ -8,6 +8,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/properties/types.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" @@ -68,9 +69,46 @@ struct UnaryOpChecker : public virtual LIBC_NAMESPACE::testing::Test { } }; +template +using BinaryOp = OutType(InType, InType); + +template Func> +struct BinaryOpChecker : public virtual LIBC_NAMESPACE::testing::Test { + using FloatType = InType; + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + using StorageType = typename FPBits::StorageType; + + // 
Check in a range, return the number of failures. + uint64_t check(StorageType x_start, StorageType x_stop, StorageType y_start, + StorageType y_stop, mpfr::RoundingMode rounding) { + mpfr::ForceRoundingMode r(rounding); + if (!r.success) + return x_stop > x_start || y_stop > y_start; + StorageType xbits = x_start; + uint64_t failed = 0; + do { + FloatType x = FPBits(xbits).get_val(); + StorageType ybits = y_start; + do { + FloatType y = FPBits(ybits).get_val(); + mpfr::BinaryInput input{x, y}; + bool correct = TEST_MPFR_MATCH_ROUNDING_SILENTLY(Op, input, Func(x, y), + 0.5, rounding); + failed += (!correct); + // Uncomment to print out failed values. + // if (!correct) { + // EXPECT_MPFR_MATCH_ROUNDING(Op, input, Func(x, y), 0.5, rounding); + // } + } while (ybits++ < y_stop); + } while (xbits++ < x_stop); + return failed; + } +}; + // Checker class needs inherit from LIBC_NAMESPACE::testing::Test and provide // StorageType and check method. -template +template struct LlvmLibcExhaustiveMathTest : public virtual LIBC_NAMESPACE::testing::Test, public Checker { @@ -78,12 +116,35 @@ struct LlvmLibcExhaustiveMathTest using FPBits = typename Checker::FPBits; using StorageType = typename Checker::StorageType; - static constexpr StorageType INCREMENT = (1 << 20); + void explain_failed_range(std::stringstream &msg, StorageType x_begin, + StorageType x_end) { +#ifdef LIBC_TYPES_HAS_FLOAT16 + using T = LIBC_NAMESPACE::cpp::conditional_t< + LIBC_NAMESPACE::cpp::is_same_v, float, FloatType>; +#else + using T = FloatType; +#endif + + msg << x_begin << " to " << x_end << " [0x" << std::hex << x_begin << ", 0x" + << x_end << "), [" << std::hexfloat + << static_cast(FPBits(x_begin).get_val()) << ", " + << static_cast(FPBits(x_end).get_val()) << ")"; + } + + void explain_failed_range(std::stringstream &msg, StorageType x_begin, + StorageType x_end, StorageType y_begin, + StorageType y_end) { + msg << "x "; + explain_failed_range(msg, x_begin, x_end); + msg << ", y "; + explain_failed_range(msg, y_begin, y_end); + } // Break [start, stop) into `nthreads` subintervals and apply *check to each // subinterval in parallel. - void test_full_range(StorageType start, StorageType stop, - mpfr::RoundingMode rounding) { + template + void test_full_range(mpfr::RoundingMode rounding, StorageType start, + StorageType stop, T... 
extra_range_bounds) { int n_threads = std::thread::hardware_concurrency(); std::vector thread_list; std::mutex mx_cur_val; @@ -102,8 +163,8 @@ struct LlvmLibcExhaustiveMathTest return; range_begin = current_value; - if (stop >= INCREMENT && stop - INCREMENT >= current_value) { - range_end = current_value + INCREMENT; + if (stop >= Increment && stop - Increment >= current_value) { + range_end = current_value + Increment; } else { range_end = stop; } @@ -120,15 +181,14 @@ struct LlvmLibcExhaustiveMathTest std::cout << msg.str() << std::flush; } - uint64_t failed_in_range = - Checker::check(range_begin, range_end, rounding); + uint64_t failed_in_range = Checker::check( + range_begin, range_end, extra_range_bounds..., rounding); if (failed_in_range > 0) { std::stringstream msg; msg << "Test failed for " << std::dec << failed_in_range - << " inputs in range: " << range_begin << " to " << range_end - << " [0x" << std::hex << range_begin << ", 0x" << range_end - << "), [" << std::hexfloat << FPBits(range_begin).get_val() - << ", " << FPBits(range_end).get_val() << ")\n"; + << " inputs in range: "; + explain_failed_range(msg, start, stop, extra_range_bounds...); + msg << "\n"; std::cerr << msg.str() << std::flush; failed.fetch_add(failed_in_range); @@ -151,19 +211,46 @@ struct LlvmLibcExhaustiveMathTest void test_full_range_all_roundings(StorageType start, StorageType stop) { std::cout << "-- Testing for FE_TONEAREST in range [0x" << std::hex << start << ", 0x" << stop << ") --" << std::dec << std::endl; - test_full_range(start, stop, mpfr::RoundingMode::Nearest); + test_full_range(mpfr::RoundingMode::Nearest, start, stop); std::cout << "-- Testing for FE_UPWARD in range [0x" << std::hex << start << ", 0x" << stop << ") --" << std::dec << std::endl; - test_full_range(start, stop, mpfr::RoundingMode::Upward); + test_full_range(mpfr::RoundingMode::Upward, start, stop); std::cout << "-- Testing for FE_DOWNWARD in range [0x" << std::hex << start << ", 0x" << stop << ") --" << std::dec << std::endl; - test_full_range(start, stop, mpfr::RoundingMode::Downward); + test_full_range(mpfr::RoundingMode::Downward, start, stop); std::cout << "-- Testing for FE_TOWARDZERO in range [0x" << std::hex << start << ", 0x" << stop << ") --" << std::dec << std::endl; - test_full_range(start, stop, mpfr::RoundingMode::TowardZero); + test_full_range(mpfr::RoundingMode::TowardZero, start, stop); + }; + + void test_full_range_all_roundings(StorageType x_start, StorageType x_stop, + StorageType y_start, StorageType y_stop) { + std::cout << "-- Testing for FE_TONEAREST in x range [0x" << std::hex + << x_start << ", 0x" << x_stop << "), y range [0x" << y_start + << ", 0x" << y_stop << ") --" << std::dec << std::endl; + test_full_range(mpfr::RoundingMode::Nearest, x_start, x_stop, y_start, + y_stop); + + std::cout << "-- Testing for FE_UPWARD in x range [0x" << std::hex + << x_start << ", 0x" << x_stop << "), y range [0x" << y_start + << ", 0x" << y_stop << ") --" << std::dec << std::endl; + test_full_range(mpfr::RoundingMode::Upward, x_start, x_stop, y_start, + y_stop); + + std::cout << "-- Testing for FE_DOWNWARD in x range [0x" << std::hex + << x_start << ", 0x" << x_stop << "), y range [0x" << y_start + << ", 0x" << y_stop << ") --" << std::dec << std::endl; + test_full_range(mpfr::RoundingMode::Downward, x_start, x_stop, y_start, + y_stop); + + std::cout << "-- Testing for FE_TOWARDZERO in x range [0x" << std::hex + << x_start << ", 0x" << x_stop << "), y range [0x" << y_start + << ", 0x" << y_stop << ") --" << std::dec 
<< std::endl; + test_full_range(mpfr::RoundingMode::TowardZero, x_start, x_stop, y_start, + y_stop); }; }; @@ -175,3 +262,8 @@ template Func> using LlvmLibcUnaryNarrowingOpExhaustiveMathTest = LlvmLibcExhaustiveMathTest>; + +template Func> +using LlvmLibcBinaryOpExhaustiveMathTest = + LlvmLibcExhaustiveMathTest, + 1 << 2>; diff --git a/libc/test/src/math/exhaustive/fmodf16_test.cpp b/libc/test/src/math/exhaustive/fmodf16_test.cpp new file mode 100644 index 00000000000000..067cec969a4f70 --- /dev/null +++ b/libc/test/src/math/exhaustive/fmodf16_test.cpp @@ -0,0 +1,41 @@ +//===-- Exhaustive test for fmodf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "exhaustive_test.h" +#include "src/math/fmodf16.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +using LlvmLibcFmodf16ExhaustiveTest = + LlvmLibcBinaryOpExhaustiveMathTest; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0]; +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcFmodf16ExhaustiveTest, PostivePositiveRange) { + test_full_range_all_roundings(POS_START, POS_STOP, POS_START, POS_STOP); +} + +TEST_F(LlvmLibcFmodf16ExhaustiveTest, PostiveNegativeRange) { + test_full_range_all_roundings(POS_START, POS_STOP, NEG_START, NEG_STOP); +} + +TEST_F(LlvmLibcFmodf16ExhaustiveTest, NegativePositiveRange) { + test_full_range_all_roundings(NEG_START, NEG_STOP, POS_START, POS_STOP); +} + +TEST_F(LlvmLibcFmodf16ExhaustiveTest, NegativeNegativeRange) { + test_full_range_all_roundings(NEG_START, NEG_STOP, POS_START, POS_STOP); +} diff --git a/libc/config/linux/signal.h.in b/libc/test/src/math/f16div_test.cpp similarity index 63% rename from libc/config/linux/signal.h.in rename to libc/test/src/math/f16div_test.cpp index fd0b41d3de5b2b..0bfa69f9b8028a 100644 --- a/libc/config/linux/signal.h.in +++ b/libc/test/src/math/f16div_test.cpp @@ -1,4 +1,4 @@ -//===-- Linux specific signal.h definitions -------------------------------===// +//===-- Unittests for f16div ----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,10 +6,8 @@ // //===----------------------------------------------------------------------===// -%%begin() +#include "DivTest.h" -#include +#include "src/math/f16div.h" -#ifndef __LLVM_LIBC_INTERNAL_SIGACTION -#define sigaction __sigaction -#endif +LIST_DIV_TESTS(float16, double, LIBC_NAMESPACE::f16div) diff --git a/libcxx/test/libcxx/containers/sequences/vector/const_T.compile.pass.cpp b/libc/test/src/math/f16divf_test.cpp similarity index 58% rename from libcxx/test/libcxx/containers/sequences/vector/const_T.compile.pass.cpp rename to libc/test/src/math/f16divf_test.cpp index 62fff96ac5abe3..85be1ebcd55c9e 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/const_T.compile.pass.cpp +++ b/libc/test/src/math/f16divf_test.cpp @@ -1,4 +1,4 @@ -//===----------------------------------------------------------------------===// +//===-- Unittests for f16divf ---------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,13 +6,8 @@ // //===----------------------------------------------------------------------===// -// Make sure that `vector` works +#include "DivTest.h" -#include +#include "src/math/f16divf.h" -void test() { - std::vector v; - v.emplace_back(1); - v.push_back(1); - v.resize(3); -} +LIST_DIV_TESTS(float16, float, LIBC_NAMESPACE::f16divf) diff --git a/libc/test/src/math/f16divl_test.cpp b/libc/test/src/math/f16divl_test.cpp new file mode 100644 index 00000000000000..bad3e70a477b4f --- /dev/null +++ b/libc/test/src/math/f16divl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16divl ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/f16divl.h" + +LIST_DIV_TESTS(float16, long double, LIBC_NAMESPACE::f16divl) diff --git a/libc/test/src/math/f16fma_test.cpp b/libc/test/src/math/f16fma_test.cpp new file mode 100644 index 00000000000000..2e46b5bdd46829 --- /dev/null +++ b/libc/test/src/math/f16fma_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16fma ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FmaTest.h" + +#include "src/math/f16fma.h" + +LIST_NARROWING_FMA_TESTS(float16, double, LIBC_NAMESPACE::f16fma) diff --git a/libc/test/src/math/f16fmaf_test.cpp b/libc/test/src/math/f16fmaf_test.cpp index e4ca88b8810e16..5e3aec768c1910 100644 --- a/libc/test/src/math/f16fmaf_test.cpp +++ b/libc/test/src/math/f16fmaf_test.cpp @@ -10,12 +10,4 @@ #include "src/math/f16fmaf.h" -using LlvmLibcF16fmafTest = FmaTestTemplate; - -TEST_F(LlvmLibcF16fmafTest, SubnormalRange) { - test_subnormal_range(&LIBC_NAMESPACE::f16fmaf); -} - -TEST_F(LlvmLibcF16fmafTest, NormalRange) { - test_normal_range(&LIBC_NAMESPACE::f16fmaf); -} +LIST_NARROWING_FMA_TESTS(float16, float, LIBC_NAMESPACE::f16fmaf) diff --git a/libc/test/src/math/f16fmal_test.cpp b/libc/test/src/math/f16fmal_test.cpp new file mode 100644 index 00000000000000..5394268a9cd8f3 --- /dev/null +++ b/libc/test/src/math/f16fmal_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16fmal ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FmaTest.h" + +#include "src/math/f16fmal.h" + +LIST_NARROWING_FMA_TESTS(float16, long double, LIBC_NAMESPACE::f16fmal) diff --git a/libc/test/src/math/f16sqrt_test.cpp b/libc/test/src/math/f16sqrt_test.cpp new file mode 100644 index 00000000000000..759b388ad3b3d9 --- /dev/null +++ b/libc/test/src/math/f16sqrt_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16sqrt ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/math/f16sqrt.h" + +LIST_NARROWING_SQRT_TESTS(float16, double, LIBC_NAMESPACE::f16sqrt) diff --git a/libc/test/src/math/f16sqrtl_test.cpp b/libc/test/src/math/f16sqrtl_test.cpp new file mode 100644 index 00000000000000..4e069bc8dbffd1 --- /dev/null +++ b/libc/test/src/math/f16sqrtl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16sqrtl --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/math/f16sqrtl.h" + +LIST_NARROWING_SQRT_TESTS(float16, long double, LIBC_NAMESPACE::f16sqrtl) diff --git a/libc/test/src/math/fmaf_test.cpp b/libc/test/src/math/fmaf_test.cpp index 0e498d46ecfb0d..09e9c504b942a0 100644 --- a/libc/test/src/math/fmaf_test.cpp +++ b/libc/test/src/math/fmaf_test.cpp @@ -10,12 +10,4 @@ #include "src/math/fmaf.h" -using LlvmLibcFmafTest = FmaTestTemplate; - -TEST_F(LlvmLibcFmafTest, SubnormalRange) { - test_subnormal_range(&LIBC_NAMESPACE::fmaf); -} - -TEST_F(LlvmLibcFmafTest, NormalRange) { - test_normal_range(&LIBC_NAMESPACE::fmaf); -} +LIST_FMA_TESTS(float, LIBC_NAMESPACE::fmaf) diff --git a/libc/config/linux/platform_defs.h.inc b/libc/test/src/math/fmul_test.cpp similarity index 51% rename from libc/config/linux/platform_defs.h.inc rename to libc/test/src/math/fmul_test.cpp index 495d07c80cf294..16eaa1a818daf7 100644 --- a/libc/config/linux/platform_defs.h.inc +++ b/libc/test/src/math/fmul_test.cpp @@ -1,13 +1,13 @@ -//===----- Definition of platform specific macros for ELF paltforms -------===// +//===-- Unittests for fmul-------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// -%%begin() +#include "FMulTest.h" -#define ENTRYPOINT_SECTION_ATTRIBUTE(name) \ - __attribute__((section(".llvm.libc.entrypoint."#name))) -#define LLVM_LIBC_ENTRYPOINT(name) ENTRYPOINT_SECTION_ATTRIBUTE(name) name +#include "src/math/fmul.h" + +LIST_FMUL_MPFR_TESTS(float, double, LIBC_NAMESPACE::fmul) diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp index 0171b79810d4e9..89534ae82b56e3 100644 --- a/libc/test/src/math/sin_test.cpp +++ b/libc/test/src/math/sin_test.cpp @@ -12,22 +12,103 @@ #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/math_macros.h" - using LlvmLibcSinTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; -TEST_F(LlvmLibcSinTest, Range) { - static constexpr double _2pi = 6.283185307179586; - constexpr StorageType COUNT = 100'000; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - double x = FPBits(v).get_val(); - // TODO: Expand the range of testing after range reduction is implemented. 
- if (isnan(x) || isinf(x) || x > _2pi || x < -_2pi) - continue; - - ASSERT_MPFR_MATCH(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sin(x), 1.0); +using LIBC_NAMESPACE::testing::tlog; + +TEST_F(LlvmLibcSinTest, TrickyInputs) { + constexpr double INPUTS[] = { + 0x1.940c877fb7dacp-7, 0x1.fffffffffdb6p24, 0x1.fd4da4ef37075p29, + 0x1.b951f1572eba5p+31, 0x1.55202aefde314p+31, 0x1.85fc0f04c0128p101, + 0x1.7776c2343ba4ep101, 0x1.678309fa50d58p110, 0x1.fffffffffef4ep199, + -0x1.ab514bfc61c76p+7, -0x1.f7898d5a756ddp+2, -0x1.f42fb19b5b9b2p-6, + 0x1.5f09cad750ab1p+3, -0x1.14823229799c2p+7, -0x1.0285070f9f1bcp-5, + 0x1.23f40dccdef72p+0, 0x1.43cf16358c9d7p+0, 0x1.addf3b9722265p+0, + 0x1.48ff1782ca91dp+8, 0x1.a211877de55dbp+4, 0x1.dcbfda0c7559ep+8, + 0x1.1ffb509f3db15p+5, 0x1.2345d1e090529p+5, 0x1.ae945054939c2p+10, + 0x1.2e566149bf5fdp+9, 0x1.be886d9c2324dp+6, -0x1.119471e9216cdp+10, + -0x1.aaf85537ea4c7p+3, 0x1.cb996c60f437ep+9, 0x1.c96e28eb679f8p+5, + -0x1.a5eece87e8606p+4, 0x1.e31b55306f22cp+2, 0x1.ae78d360afa15p+0, + 0x1.1685973506319p+3, 0x1.4f2b874135d27p+4, 0x1.ae945054939c2p+10, + 0x1.3eec5912ea7cdp+331, 0x1.dcbfda0c7559ep+8, 0x1.a65d441ea6dcep+4, + 0x1.e639103a05997p+2, 0x1.13114266f9764p+4, -0x1.3eec5912ea7cdp+331, + 0x1.08087e9aad90bp+887, 0x1.2b5fe88a9d8d5p+903, -0x1.a880417b7b119p+1023, + -0x1.6deb37da81129p+205, 0x1.08087e9aad90bp+887, 0x1.f6d7518808571p+1023, + -0x1.8bb5847d49973p+845, 0x1.f08b14e1c4d0fp+890, 0x1.6ac5b262ca1ffp+849, + 0x1.e0000000001c2p-20, + }; + constexpr int N = sizeof(INPUTS) / sizeof(INPUTS[0]); + + for (int i = 0; i < N; ++i) { + double x = INPUTS[i]; + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, + LIBC_NAMESPACE::sin(x), 0.5); } } + +TEST_F(LlvmLibcSinTest, InDoubleRange) { + constexpr uint64_t COUNT = 1'234'51; + uint64_t START = LIBC_NAMESPACE::fputil::FPBits(0x1.0p-50).uintval(); + uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits(0x1.0p200).uintval(); + uint64_t STEP = (STOP - START) / COUNT; + + auto test = [&](mpfr::RoundingMode rounding_mode) { + mpfr::ForceRoundingMode __r(rounding_mode); + if (!__r.success) + return; + + uint64_t fails = 0; + uint64_t count = 0; + uint64_t cc = 0; + double mx, mr = 0.0; + double tol = 0.5; + + for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) { + double x = FPBits(v).get_val(); + if (isnan(x) || isinf(x)) + continue; + LIBC_NAMESPACE::libc_errno = 0; + double result = LIBC_NAMESPACE::sin(x); + ++cc; + if (isnan(result) || isinf(result)) + continue; + + ++count; + + if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Sin, x, result, + 0.5, rounding_mode)) { + ++fails; + while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Sin, x, + result, tol, rounding_mode)) { + mx = x; + mr = result; + + if (tol > 1000.0) + break; + + tol *= 2.0; + } + } + } + if (fails) { + tlog << " Sin failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; + EXPECT_MPFR_MATCH(mpfr::Operation::Sin, mx, mr, 0.5, rounding_mode); + } + }; + + tlog << " Test Rounding To Nearest...\n"; + test(mpfr::RoundingMode::Nearest); + + tlog << " Test Rounding Downward...\n"; + test(mpfr::RoundingMode::Downward); + + tlog << " Test Rounding Upward...\n"; + test(mpfr::RoundingMode::Upward); + + tlog << " Test Rounding Toward Zero...\n"; + test(mpfr::RoundingMode::TowardZero); +} diff --git a/libc/test/src/math/sincos_test.cpp b/libc/test/src/math/sincos_test.cpp new file mode 100644 index 00000000000000..7e06456f2e05c5 --- /dev/null +++ 
b/libc/test/src/math/sincos_test.cpp @@ -0,0 +1,118 @@ +//===-- Unittests for sincos ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/FPUtil/FPBits.h" +#include "src/math/sincos.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcSincosTest = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +using LIBC_NAMESPACE::testing::tlog; + +#define ASSERT_SINCOS_MATCH_ALL_ROUNDING(input) \ + do { \ + double sin_x, cos_x; \ + namespace mpfr = LIBC_NAMESPACE::testing::mpfr; \ + \ + mpfr::ForceRoundingMode __r1(mpfr::RoundingMode::Nearest); \ + if (__r1.success) { \ + LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + mpfr::RoundingMode::Nearest); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + mpfr::RoundingMode::Nearest); \ + } \ + \ + mpfr::ForceRoundingMode __r2(mpfr::RoundingMode::Upward); \ + if (__r2.success) { \ + LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + mpfr::RoundingMode::Upward); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + mpfr::RoundingMode::Upward); \ + } \ + \ + mpfr::ForceRoundingMode __r3(mpfr::RoundingMode::Downward); \ + if (__r3.success) { \ + LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + mpfr::RoundingMode::Downward); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + mpfr::RoundingMode::Downward); \ + } \ + \ + mpfr::ForceRoundingMode __r4(mpfr::RoundingMode::TowardZero); \ + if (__r4.success) { \ + LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + mpfr::RoundingMode::TowardZero); \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + mpfr::RoundingMode::TowardZero); \ + } \ + } while (0) + +TEST_F(LlvmLibcSincosTest, TrickyInputs) { + constexpr double INPUTS[] = { + 0x1.8000000000009p-23, 0x1.8000000000024p-22, + 0x1.800000000009p-21, 0x1.20000000000f3p-20, + 0x1.800000000024p-20, 0x1.e0000000001c2p-20, + 0x1.940c877fb7dacp-7, -0x1.f42fb19b5b9b2p-6, + -0x1.0285070f9f1bcp-5, 0x1.23f40dccdef72p+0, + 0x1.43cf16358c9d7p+0, 0x1.addf3b9722265p+0, + 0x1.ae78d360afa15p+0, 0x1.e31b55306f22cp+2, + 0x1.e639103a05997p+2, -0x1.f7898d5a756ddp+2, + 0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3, + -0x1.aaf85537ea4c7p+3, 0x1.4f2b874135d27p+4, + 0x1.13114266f9764p+4, 0x1.a211877de55dbp+4, + -0x1.a5eece87e8606p+4, 0x1.a65d441ea6dcep+4, + 0x1.1ffb509f3db15p+5, 0x1.2345d1e090529p+5, + 0x1.c96e28eb679f8p+5, 0x1.be886d9c2324dp+6, + -0x1.ab514bfc61c76p+7, -0x1.14823229799c2p+7, + 0x1.48ff1782ca91dp+8, 0x1.dcbfda0c7559ep+8, + 0x1.dcbfda0c7559ep+8, 0x1.2e566149bf5fdp+9, + 0x1.cb996c60f437ep+9, 0x1.ae945054939c2p+10, + -0x1.119471e9216cdp+10, 0x1.ae945054939c2p+10, + 0x1.fffffffffdb6p+24, 0x1.fd4da4ef37075p+29, + 0x1.55202aefde314p+31, 0x1.b951f1572eba5p+31, + 0x1.7776c2343ba4ep+101, 0x1.85fc0f04c0128p+101, + 0x1.678309fa50d58p+110, 0x1.fffffffffef4ep+199, + 0x1.3eec5912ea7cdp+331, -0x1.3eec5912ea7cdp+331, + -0x1.6deb37da81129p+205, 
0x1.08087e9aad90bp+887, + -0x1.08087e9aad90bp+887, -0x1.8bb5847d49973p+845, + 0x1.6ac5b262ca1ffp+849, 0x1.f08b14e1c4d0fp+890, + 0x1.2b5fe88a9d8d5p+903, 0x1.f6d7518808571p+1023, + -0x1.a880417b7b119p+1023, 0x1.00a33764a0a83p-7, + 0x1.fe81868fc47fep+1, 0x1.0da8cc189b47dp-10, + 0x1.da1838053b866p+5, + + }; + constexpr int N = sizeof(INPUTS) / sizeof(INPUTS[0]); + + for (int i = 0; i < N; ++i) { + ASSERT_SINCOS_MATCH_ALL_ROUNDING(INPUTS[i]); + } +} + +TEST_F(LlvmLibcSincosTest, InDoubleRange) { + constexpr uint64_t COUNT = 123'41; + uint64_t START = LIBC_NAMESPACE::fputil::FPBits(0x1.0p-50).uintval(); + uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits(0x1.0p200).uintval(); + uint64_t STEP = (STOP - START) / COUNT; + + for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) { + double x = FPBits(v).get_val(); + if (isnan(x) || isinf(x)) + continue; + + ASSERT_SINCOS_MATCH_ALL_ROUNDING(x); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 8cae52fb0cebe5..a1b4a03eb2e854 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3630,6 +3630,74 @@ add_fp_unittest( libc.src.math.setpayloadsigf16 ) +add_fp_unittest( + f16div_test + SUITE + libc-math-smoke-tests + SRCS + f16div_test.cpp + HDRS + DivTest.h + DEPENDS + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16div +) + +add_fp_unittest( + f16divf_test + SUITE + libc-math-smoke-tests + SRCS + f16divf_test.cpp + HDRS + DivTest.h + DEPENDS + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16divf +) + +add_fp_unittest( + f16divl_test + SUITE + libc-math-smoke-tests + SRCS + f16divl_test.cpp + HDRS + DivTest.h + DEPENDS + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16divl +) + +add_fp_unittest( + f16divf128_test + SUITE + libc-math-smoke-tests + SRCS + f16divf128_test.cpp + HDRS + DivTest.h + DEPENDS + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16divf128 +) + +add_fp_unittest( + f16fma_test + SUITE + libc-math-smoke-tests + SRCS + f16fma_test.cpp + HDRS + FmaTest.h + DEPENDS + libc.src.math.f16fma +) + add_fp_unittest( f16fmaf_test SUITE @@ -3642,6 +3710,42 @@ add_fp_unittest( libc.src.math.f16fmaf ) +add_fp_unittest( + f16fmal_test + SUITE + libc-math-smoke-tests + SRCS + f16fmal_test.cpp + HDRS + FmaTest.h + DEPENDS + libc.src.math.f16fmal +) + +add_fp_unittest( + f16fmaf128_test + SUITE + libc-math-smoke-tests + SRCS + f16fmaf128_test.cpp + HDRS + FmaTest.h + DEPENDS + libc.src.math.f16fmaf128 +) + +add_fp_unittest( + f16sqrt_test + SUITE + libc-math-smoke-tests + SRCS + f16sqrt_test.cpp + HDRS + SqrtTest.h + DEPENDS + libc.src.math.f16sqrt +) + add_fp_unittest( f16sqrtf_test SUITE @@ -3653,3 +3757,57 @@ add_fp_unittest( DEPENDS libc.src.math.f16sqrtf ) + +add_fp_unittest( + f16sqrtl_test + SUITE + libc-math-smoke-tests + SRCS + f16sqrtl_test.cpp + HDRS + SqrtTest.h + DEPENDS + libc.src.math.f16sqrtl +) + +add_fp_unittest( + f16sqrtf128_test + SUITE + libc-math-smoke-tests + SRCS + f16sqrtf128_test.cpp + HDRS + SqrtTest.h + DEPENDS + libc.src.math.f16sqrtf128 +) + +add_fp_unittest( + sin_test + SUITE + libc-math-smoke-tests + SRCS + sin_test.cpp + DEPENDS + libc.src.math.sin +) + +add_fp_unittest( + cos_test + SUITE + libc-math-smoke-tests + SRCS + cos_test.cpp + DEPENDS + libc.src.math.cos +) + +add_fp_unittest( + sincos_test + SUITE + libc-math-smoke-tests + SRCS + sincos_test.cpp + DEPENDS + 
libc.src.math.sincos +) diff --git a/libc/test/src/math/smoke/DivTest.h b/libc/test/src/math/smoke/DivTest.h new file mode 100644 index 00000000000000..71cfb326b97cc2 --- /dev/null +++ b/libc/test/src/math/smoke/DivTest.h @@ -0,0 +1,171 @@ +//===-- Utility class to test different flavors of float div --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_DIVTEST_H +#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_DIVTEST_H + +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/RoundingModeUtils.h" +#include "test/UnitTest/Test.h" + +template +class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { + + DECLARE_SPECIAL_CONSTANTS(OutType) + + struct InConstants { + DECLARE_SPECIAL_CONSTANTS(InType) + }; + + using InFPBits = typename InConstants::FPBits; + using InStorageType = typename InConstants::StorageType; + +public: + using DivFunc = OutType (*)(InType, InType); + + void test_special_numbers(DivFunc func) { + EXPECT_FP_IS_NAN(func(aNaN, aNaN)); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); + + InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); + EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero))); + EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42))); + + if constexpr (sizeof(OutType) < sizeof(InType)) { + InStorageType max_payload = InFPBits::FRACTION_MASK >> 1; + InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val(); + EXPECT_FP_EQ(zero, + LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero))); + EXPECT_FP_EQ(zero, + LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max))); + EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42))); + EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max))); + } + + EXPECT_FP_EQ(inf, func(inf, zero)); + EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_inf, func(inf, neg_zero)); + EXPECT_FP_EQ(inf, func(neg_inf, neg_zero)); + } + + void test_division_by_zero(DivFunc func) { + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(InType(1.0), zero), FE_DIVBYZERO); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(InType(-1.0), zero), + FE_DIVBYZERO); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(InType(1.0), neg_zero), + FE_DIVBYZERO); + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(InType(1.0), zero), FE_DIVBYZERO); + } + + void test_invalid_operations(DivFunc func) { + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(zero, zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_zero, zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(zero, neg_zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_zero, neg_zero), FE_INVALID); + + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, inf), FE_INVALID); + EXPECT_MATH_ERRNO(EDOM); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, inf), FE_INVALID); + EXPECT_MATH_ERRNO(EDOM); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, neg_inf), FE_INVALID); + EXPECT_MATH_ERRNO(EDOM); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, neg_inf), FE_INVALID); + EXPECT_MATH_ERRNO(EDOM); + } + + void 
test_range_errors(DivFunc func) { + using namespace LIBC_NAMESPACE::fputil::testing; + + if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, min_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, min_denormal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + + if (ForceRoundingMode r(RoundingMode::TowardZero); r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, min_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, + func(neg_max_normal, min_denormal), + FE_OVERFLOW | FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + + if (ForceRoundingMode r(RoundingMode::Downward); r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, min_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, min_denormal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_min_denormal, + func(neg_min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + + if (ForceRoundingMode r(RoundingMode::Upward); r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, min_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, + func(neg_max_normal, min_denormal), + FE_OVERFLOW | FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal, func(min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_min_denormal, max_normal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + } + + void test_inexact_results(DivFunc func) { + func(InType(1.0), InType(3.0)); + EXPECT_FP_EXCEPTION(FE_INEXACT); + } +}; + +#define LIST_DIV_TESTS(OutType, InType, func) \ + using LlvmLibcDivTest = DivTest; \ + TEST_F(LlvmLibcDivTest, SpecialNumbers) { test_special_numbers(&func); } \ + TEST_F(LlvmLibcDivTest, DivisionByZero) { test_division_by_zero(&func); } \ + TEST_F(LlvmLibcDivTest, InvalidOperations) { \ + test_invalid_operations(&func); \ + } \ + TEST_F(LlvmLibcDivTest, RangeErrors) { test_range_errors(&func); } \ + TEST_F(LlvmLibcDivTest, InexactResults) { test_inexact_results(&func); } + +#endif // LLVM_LIBC_TEST_SRC_MATH_SMOKE_DIVTEST_H diff --git a/libc/test/src/math/smoke/cos_test.cpp b/libc/test/src/math/smoke/cos_test.cpp new file mode 100644 index 00000000000000..81c8612dba26e5 --- /dev/null +++ b/libc/test/src/math/smoke/cos_test.cpp @@ -0,0 +1,26 @@ +//===-- Unittests for cos -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cos.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcCosTest = LIBC_NAMESPACE::testing::FPTest; + +using LIBC_NAMESPACE::testing::tlog; + +TEST_F(LlvmLibcCosTest, SpecialNumbers) { + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cos(aNaN)); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cos(inf)); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cos(neg_inf)); + EXPECT_FP_EQ_ALL_ROUNDING(1.0, LIBC_NAMESPACE::cos(zero)); + EXPECT_FP_EQ_ALL_ROUNDING(1.0, LIBC_NAMESPACE::cos(neg_zero)); + EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::cos(0x1.0p-50)); + EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::cos(min_normal)); + EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::cos(min_denormal)); +} diff --git a/libc/test/src/math/smoke/f16div_test.cpp b/libc/test/src/math/smoke/f16div_test.cpp new file mode 100644 index 00000000000000..0bfa69f9b8028a --- /dev/null +++ b/libc/test/src/math/smoke/f16div_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16div ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/f16div.h" + +LIST_DIV_TESTS(float16, double, LIBC_NAMESPACE::f16div) diff --git a/libc/test/src/math/smoke/f16divf128_test.cpp b/libc/test/src/math/smoke/f16divf128_test.cpp new file mode 100644 index 00000000000000..d2ea9718246216 --- /dev/null +++ b/libc/test/src/math/smoke/f16divf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16divf128 ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/f16divf128.h" + +LIST_DIV_TESTS(float16, float128, LIBC_NAMESPACE::f16divf128) diff --git a/libc/test/src/math/smoke/f16divf_test.cpp b/libc/test/src/math/smoke/f16divf_test.cpp new file mode 100644 index 00000000000000..85be1ebcd55c9e --- /dev/null +++ b/libc/test/src/math/smoke/f16divf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16divf ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/f16divf.h" + +LIST_DIV_TESTS(float16, float, LIBC_NAMESPACE::f16divf) diff --git a/libc/test/src/math/smoke/f16divl_test.cpp b/libc/test/src/math/smoke/f16divl_test.cpp new file mode 100644 index 00000000000000..bad3e70a477b4f --- /dev/null +++ b/libc/test/src/math/smoke/f16divl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16divl ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/f16divl.h" + +LIST_DIV_TESTS(float16, long double, LIBC_NAMESPACE::f16divl) diff --git a/libc/test/src/math/smoke/f16fma_test.cpp b/libc/test/src/math/smoke/f16fma_test.cpp new file mode 100644 index 00000000000000..2e46b5bdd46829 --- /dev/null +++ b/libc/test/src/math/smoke/f16fma_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16fma ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FmaTest.h" + +#include "src/math/f16fma.h" + +LIST_NARROWING_FMA_TESTS(float16, double, LIBC_NAMESPACE::f16fma) diff --git a/libc/test/src/math/smoke/f16fmaf128_test.cpp b/libc/test/src/math/smoke/f16fmaf128_test.cpp new file mode 100644 index 00000000000000..ea4003bc5f77ee --- /dev/null +++ b/libc/test/src/math/smoke/f16fmaf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16fmaf128 ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FmaTest.h" + +#include "src/math/f16fmaf128.h" + +LIST_NARROWING_FMA_TESTS(float16, float128, LIBC_NAMESPACE::f16fmaf128) diff --git a/libc/test/src/math/smoke/f16fmal_test.cpp b/libc/test/src/math/smoke/f16fmal_test.cpp new file mode 100644 index 00000000000000..5394268a9cd8f3 --- /dev/null +++ b/libc/test/src/math/smoke/f16fmal_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16fmal ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FmaTest.h" + +#include "src/math/f16fmal.h" + +LIST_NARROWING_FMA_TESTS(float16, long double, LIBC_NAMESPACE::f16fmal) diff --git a/libc/test/src/math/smoke/f16sqrt_test.cpp b/libc/test/src/math/smoke/f16sqrt_test.cpp new file mode 100644 index 00000000000000..759b388ad3b3d9 --- /dev/null +++ b/libc/test/src/math/smoke/f16sqrt_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16sqrt ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/math/f16sqrt.h" + +LIST_NARROWING_SQRT_TESTS(float16, double, LIBC_NAMESPACE::f16sqrt) diff --git a/libc/test/src/math/smoke/f16sqrtf128_test.cpp b/libc/test/src/math/smoke/f16sqrtf128_test.cpp new file mode 100644 index 00000000000000..31c994860dcd8e --- /dev/null +++ b/libc/test/src/math/smoke/f16sqrtf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16sqrtf128 -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/math/f16sqrtf128.h" + +LIST_NARROWING_SQRT_TESTS(float16, float128, LIBC_NAMESPACE::f16sqrtf128) diff --git a/libc/test/src/math/smoke/f16sqrtl_test.cpp b/libc/test/src/math/smoke/f16sqrtl_test.cpp new file mode 100644 index 00000000000000..4e069bc8dbffd1 --- /dev/null +++ b/libc/test/src/math/smoke/f16sqrtl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16sqrtl --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/math/f16sqrtl.h" + +LIST_NARROWING_SQRT_TESTS(float16, long double, LIBC_NAMESPACE::f16sqrtl) diff --git a/libc/test/src/math/smoke/sin_test.cpp b/libc/test/src/math/smoke/sin_test.cpp new file mode 100644 index 00000000000000..16ced68709ca75 --- /dev/null +++ b/libc/test/src/math/smoke/sin_test.cpp @@ -0,0 +1,26 @@ +//===-- Unittests for sin -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sin.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcSinTest = LIBC_NAMESPACE::testing::FPTest; + +using LIBC_NAMESPACE::testing::tlog; + +TEST_F(LlvmLibcSinTest, SpecialNumbers) { + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sin(aNaN)); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sin(inf)); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sin(neg_inf)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::sin(zero)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::sin(neg_zero)); + EXPECT_FP_EQ(0x1.0p-50, LIBC_NAMESPACE::sin(0x1.0p-50)); + EXPECT_FP_EQ(min_normal, LIBC_NAMESPACE::sin(min_normal)); + EXPECT_FP_EQ(min_denormal, LIBC_NAMESPACE::sin(min_denormal)); +} diff --git a/libc/test/src/math/smoke/sincos_test.cpp b/libc/test/src/math/smoke/sincos_test.cpp new file mode 100644 index 00000000000000..371c0ad63cbf55 --- /dev/null +++ b/libc/test/src/math/smoke/sincos_test.cpp @@ -0,0 +1,41 @@ +//===-- Unittests for sincos ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sincos.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcSincosTest = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcSincosTest, SpecialNumbers) { + double sin_x, cos_x; + + LIBC_NAMESPACE::sincos(aNaN, &sin_x, &cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, sin_x); + + LIBC_NAMESPACE::sincos(zero, &sin_x, &cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(1.0, cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(0.0, sin_x); + + LIBC_NAMESPACE::sincos(neg_zero, &sin_x, &cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(1.0, cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, sin_x); + + LIBC_NAMESPACE::sincos(inf, &sin_x, &cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, sin_x); + + LIBC_NAMESPACE::sincos(neg_inf, &sin_x, &cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, sin_x); + + LIBC_NAMESPACE::sincos(0x1.0p-28, &sin_x, &cos_x); + EXPECT_FP_EQ(1.0, cos_x); + EXPECT_FP_EQ(0x1.0p-28, sin_x); +} diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index 56ae6fd2cf6170..edbd5c19edab38 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -32,8 +32,8 @@ add_libc_unittest( SRCS sigaction_test.cpp DEPENDS - libc.include.errno - libc.include.signal + libc.hdr.errno_macros + libc.hdr.signal_macros libc.src.signal.raise libc.src.signal.sigaction libc.test.UnitTest.ErrnoSetterMatcher @@ -119,7 +119,7 @@ add_libc_unittest( SRCS sigaltstack_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.errno.errno libc.src.signal.raise libc.src.signal.sigaltstack diff --git a/libc/test/src/signal/sigaction_test.cpp b/libc/test/src/signal/sigaction_test.cpp index 262469c7600e3f..a12d7989586fe6 100644 --- a/libc/test/src/signal/sigaction_test.cpp +++ b/libc/test/src/signal/sigaction_test.cpp @@ -6,15 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "hdr/errno_macros.h" +#include "hdr/signal_macros.h" #include "src/signal/raise.h" #include "src/signal/sigaction.h" - #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include -#include - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp index 12bf2bf5e3723a..cc392da8f4731b 100644 --- a/libc/test/src/signal/sigaltstack_test.cpp +++ b/libc/test/src/signal/sigaltstack_test.cpp @@ -6,17 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/signal/raise.h" #include "src/signal/sigaction.h" #include "src/signal/sigaltstack.h" - #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include #include #include diff --git a/libc/test/src/stdlib/rand_test.cpp b/libc/test/src/stdlib/rand_test.cpp index 7934dc16aa461a..6f25708e539053 100644 --- a/libc/test/src/stdlib/rand_test.cpp +++ b/libc/test/src/stdlib/rand_test.cpp @@ -23,15 +23,12 @@ TEST(LlvmLibcRandTest, UnsetSeed) { vals[i] = val; } - // FIXME: The GPU implementation cannot initialize the seed correctly. -#ifndef LIBC_TARGET_ARCH_IS_GPU // The C standard specifies that if 'srand' is never called it should behave // as if 'srand' was called with a value of 1. If we seed the value with 1 we // should get the same sequence as the unseeded version. LIBC_NAMESPACE::srand(1); for (size_t i = 0; i < 1000; ++i) ASSERT_EQ(LIBC_NAMESPACE::rand(), vals[i]); -#endif } TEST(LlvmLibcRandTest, SetSeed) { diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 2eac4dd8e199de..1ff241ddf20add 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -296,6 +296,12 @@ class MPFRNumber { return result; } + MPFRNumber div(const MPFRNumber &b) const { + MPFRNumber result(*this); + mpfr_div(result.value, value, b.value, mpfr_rounding); + return result; + } + MPFRNumber floor() const { MPFRNumber result(*this); mpfr_floor(result.value, value); @@ -467,6 +473,12 @@ class MPFRNumber { return result; } + MPFRNumber fmul(const MPFRNumber &b) { + MPFRNumber result(*this); + mpfr_mul(result.value, value, b.value, mpfr_rounding); + return result; + } + cpp::string str() const { // 200 bytes should be more than sufficient to hold a 100-digit number // plus additional bytes for the decimal point, '-' sign etc. 
@@ -708,12 +720,16 @@ binary_operation_one_output(Operation op, InputType x, InputType y, switch (op) { case Operation::Atan2: return inputX.atan2(inputY); + case Operation::Div: + return inputX.div(inputY); case Operation::Fmod: return inputX.fmod(inputY); case Operation::Hypot: return inputX.hypot(inputY); case Operation::Pow: return inputX.pow(inputY); + case Operation::Fmul: + return inputX.fmul(inputY); default: __builtin_unreachable(); } @@ -802,6 +818,12 @@ template void explain_unary_operation_single_output_error(Operation op, float16, template void explain_unary_operation_single_output_error(Operation op, float, float16, double, RoundingMode); +template void explain_unary_operation_single_output_error(Operation op, double, + float16, double, + RoundingMode); +template void explain_unary_operation_single_output_error(Operation op, + long double, float16, + double, RoundingMode); #endif template @@ -885,38 +907,37 @@ template void explain_binary_operation_two_outputs_error( Operation, const BinaryInput &, const BinaryOutput &, double, RoundingMode); -template -void explain_binary_operation_one_output_error(Operation op, - const BinaryInput &input, - T libc_result, - double ulp_tolerance, - RoundingMode rounding) { - unsigned int precision = get_precision(ulp_tolerance); +template +void explain_binary_operation_one_output_error( + Operation op, const BinaryInput &input, OutputType libc_result, + double ulp_tolerance, RoundingMode rounding) { + unsigned int precision = get_precision(ulp_tolerance); MPFRNumber mpfrX(input.x, precision); MPFRNumber mpfrY(input.y, precision); - FPBits xbits(input.x); - FPBits ybits(input.y); + FPBits xbits(input.x); + FPBits ybits(input.y); MPFRNumber mpfr_result = binary_operation_one_output(op, input.x, input.y, precision, rounding); MPFRNumber mpfrMatchValue(libc_result); tlog << "Input decimal: x: " << mpfrX.str() << " y: " << mpfrY.str() << '\n'; - tlog << "First input bits: " << str(FPBits(input.x)) << '\n'; - tlog << "Second input bits: " << str(FPBits(input.y)) << '\n'; + tlog << "First input bits: " << str(FPBits(input.x)) << '\n'; + tlog << "Second input bits: " << str(FPBits(input.y)) << '\n'; tlog << "Libc result: " << mpfrMatchValue.str() << '\n' << "MPFR result: " << mpfr_result.str() << '\n'; - tlog << "Libc floating point result bits: " << str(FPBits(libc_result)) - << '\n'; + tlog << "Libc floating point result bits: " + << str(FPBits(libc_result)) << '\n'; tlog << " MPFR rounded bits: " - << str(FPBits(mpfr_result.as())) << '\n'; + << str(FPBits(mpfr_result.as())) << '\n'; tlog << "ULP error: " << mpfr_result.ulp_as_mpfr_number(libc_result).str() << '\n'; } -template void explain_binary_operation_one_output_error( - Operation, const BinaryInput &, float, double, RoundingMode); -template void explain_binary_operation_one_output_error( +template void +explain_binary_operation_one_output_error(Operation, const BinaryInput &, + float, double, RoundingMode); +template void explain_binary_operation_one_output_error( Operation, const BinaryInput &, double, double, RoundingMode); template void explain_binary_operation_one_output_error( Operation, const BinaryInput &, long double, double, @@ -964,6 +985,12 @@ explain_ternary_operation_one_output_error(Operation, #ifdef LIBC_TYPES_HAS_FLOAT16 template void explain_ternary_operation_one_output_error( Operation, const TernaryInput &, float16, double, RoundingMode); +template void explain_ternary_operation_one_output_error( + Operation, const TernaryInput &, float16, double, RoundingMode); +template 
void +explain_ternary_operation_one_output_error(Operation, + const TernaryInput &, + float16, double, RoundingMode); #endif template @@ -990,6 +1017,11 @@ template bool compare_unary_operation_single_output(Operation, float16, float16, double, RoundingMode); template bool compare_unary_operation_single_output(Operation, float, float16, double, RoundingMode); +template bool compare_unary_operation_single_output(Operation, double, float16, + double, RoundingMode); +template bool compare_unary_operation_single_output(Operation, long double, + float16, double, + RoundingMode); #endif template @@ -1051,10 +1083,11 @@ template bool compare_binary_operation_two_outputs( Operation, const BinaryInput &, const BinaryOutput &, double, RoundingMode); -template +template bool compare_binary_operation_one_output(Operation op, - const BinaryInput &input, - T libc_result, double ulp_tolerance, + const BinaryInput &input, + OutputType libc_result, + double ulp_tolerance, RoundingMode rounding) { unsigned int precision = get_precision(ulp_tolerance); MPFRNumber mpfr_result = @@ -1102,6 +1135,14 @@ template bool compare_ternary_operation_one_output(Operation, const TernaryInput &, float16, double, RoundingMode); +template bool compare_ternary_operation_one_output(Operation, + const TernaryInput &, + float16, double, + RoundingMode); +template bool +compare_ternary_operation_one_output(Operation, + const TernaryInput &, float16, + double, RoundingMode); #endif } // namespace internal diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 0b4f42a72ec813..71087361947063 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -41,6 +41,7 @@ enum class Operation : int { Exp10, Expm1, Floor, + Fmul, Log, Log2, Log10, @@ -71,6 +72,7 @@ enum class Operation : int { // output. 
BeginBinaryOperationsSingleOutput, Atan2, + Div, Fmod, Hypot, Pow, @@ -129,6 +131,14 @@ struct AreMatchingBinaryInputAndBinaryOutput, BinaryOutput> { static constexpr bool VALUE = cpp::is_floating_point_v; }; +template struct IsBinaryInput { + static constexpr bool VALUE = false; +}; + +template struct IsBinaryInput> { + static constexpr bool VALUE = true; +}; + template struct IsTernaryInput { static constexpr bool VALUE = false; }; @@ -137,8 +147,19 @@ template struct IsTernaryInput> { static constexpr bool VALUE = true; }; +template struct IsBinaryInput { + static constexpr bool VALUE = false; +}; + +template struct IsBinaryInput> { + static constexpr bool VALUE = true; +}; + template struct MakeScalarInput : cpp::type_identity {}; +template +struct MakeScalarInput> : cpp::type_identity {}; + template struct MakeScalarInput> : cpp::type_identity {}; @@ -159,10 +180,11 @@ bool compare_binary_operation_two_outputs(Operation op, double ulp_tolerance, RoundingMode rounding); -template +template bool compare_binary_operation_one_output(Operation op, - const BinaryInput &input, - T libc_output, double ulp_tolerance, + const BinaryInput &input, + OutputType libc_output, + double ulp_tolerance, RoundingMode rounding); template @@ -187,12 +209,10 @@ void explain_binary_operation_two_outputs_error( const BinaryOutput &match_value, double ulp_tolerance, RoundingMode rounding); -template -void explain_binary_operation_one_output_error(Operation op, - const BinaryInput &input, - T match_value, - double ulp_tolerance, - RoundingMode rounding); +template +void explain_binary_operation_one_output_error( + Operation op, const BinaryInput &input, OutputType match_value, + double ulp_tolerance, RoundingMode rounding); template void explain_ternary_operation_one_output_error( @@ -225,56 +245,63 @@ class MPFRMatcher : public testing::Matcher { bool is_silent() const override { return silent; } private: - template bool match(T in, U out) { + template + bool match(InType in, OutType out) { return compare_unary_operation_single_output(op, in, out, ulp_tolerance, rounding); } - template bool match(T in, const BinaryOutput &out) { + template + bool match(InType in, const BinaryOutput &out) { return compare_unary_operation_two_outputs(op, in, out, ulp_tolerance, rounding); } - template bool match(const BinaryInput &in, T out) { + template + bool match(const BinaryInput &in, U out) { return compare_binary_operation_one_output(op, in, out, ulp_tolerance, rounding); } - template - bool match(BinaryInput in, const BinaryOutput &out) { + template + bool match(BinaryInput in, const BinaryOutput &out) { return compare_binary_operation_two_outputs(op, in, out, ulp_tolerance, rounding); } - template - bool match(const TernaryInput &in, U out) { + template + bool match(const TernaryInput &in, OutType out) { return compare_ternary_operation_one_output(op, in, out, ulp_tolerance, rounding); } - template void explain_error(T in, U out) { + template + void explain_error(InType in, OutType out) { explain_unary_operation_single_output_error(op, in, out, ulp_tolerance, rounding); } - template void explain_error(T in, const BinaryOutput &out) { + template + void explain_error(InType in, const BinaryOutput &out) { explain_unary_operation_two_outputs_error(op, in, out, ulp_tolerance, rounding); } - template - void explain_error(const BinaryInput &in, const BinaryOutput &out) { + template + void explain_error(const BinaryInput &in, + const BinaryOutput &out) { explain_binary_operation_two_outputs_error(op, in, out, ulp_tolerance, 
rounding); } - template void explain_error(const BinaryInput &in, T out) { + template + void explain_error(const BinaryInput &in, U out) { explain_binary_operation_one_output_error(op, in, out, ulp_tolerance, rounding); } - template - void explain_error(const TernaryInput &in, U out) { + template + void explain_error(const TernaryInput &in, OutType out) { explain_ternary_operation_one_output_error(op, in, out, ulp_tolerance, rounding); } @@ -290,9 +317,15 @@ constexpr bool is_valid_operation() { (op == Operation::Sqrt && cpp::is_floating_point_v && cpp::is_floating_point_v && sizeof(OutputType) <= sizeof(InputType)) || + (op == Operation::Div && internal::IsBinaryInput::VALUE && + cpp::is_floating_point_v< + typename internal::MakeScalarInput::type> && + cpp::is_floating_point_v) || (op == Operation::Fma && internal::IsTernaryInput::VALUE && cpp::is_floating_point_v< typename internal::MakeScalarInput::type> && + cpp::is_floating_point_v) || + (op == Operation::Fmul && internal::IsBinaryInput::VALUE && cpp::is_floating_point_v); if (IS_NARROWING_OP) return true; diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 9858ae905983f3..ef8d21b167623a 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -374,15 +374,21 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) OUTPUT ${output_file} EXTRA_OPTS "${mcpu}" -fno-builtin -nostdlib "${build_flags}" -I${PROJECT_SOURCE_DIR}/${file_dir} + DEPENDENCIES generate_convert.cl clspv-generate_convert.cl ) list( APPEND bytecode_files ${output_file} ) endforeach() - set( builtins_link_lib_tgt builtins.link.${arch_suffix} ) + set( builtins_comp_lib_tgt builtins.comp.${arch_suffix} ) + add_custom_target( ${builtins_comp_lib_tgt} + DEPENDS ${bytecode_files} + ) + set( builtins_link_lib_tgt builtins.link.${arch_suffix} ) link_bc( TARGET ${builtins_link_lib_tgt} INPUTS ${bytecode_files} + DEPENDENCIES ${builtins_comp_lib_tgt} ) set( builtins_link_lib $ ) @@ -391,7 +397,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( spv_suffix ${arch_suffix}.spv ) add_custom_command( OUTPUT ${spv_suffix} COMMAND libclc::llvm-spirv ${spvflags} -o ${spv_suffix} ${builtins_link_lib} - DEPENDS ${builtins_link_lib} + DEPENDS ${builtins_link_lib} ${builtins_link_lib_tgt} ) add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix} @@ -403,7 +409,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) add_custom_command( OUTPUT ${builtins_opt_lib_tgt}.bc COMMAND libclc::opt ${opt_flags} -o ${builtins_opt_lib_tgt}.bc ${builtins_link_lib} - DEPENDS libclc::opt ${builtins_link_lib} + DEPENDS libclc::opt ${builtins_link_lib} ${builtins_link_lib_tgt} ) add_custom_target( ${builtins_opt_lib_tgt} ALL DEPENDS ${builtins_opt_lib_tgt}.bc @@ -418,7 +424,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( obj_suffix ${arch_suffix}.bc ) add_custom_command( OUTPUT ${obj_suffix} COMMAND prepare_builtins -o ${obj_suffix} ${builtins_opt_lib} - DEPENDS ${builtins_opt_lib} prepare_builtins ) + DEPENDS ${builtins_opt_lib} ${builtins_opt_lib_tgt} prepare_builtins ) add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} ) # nvptx-- targets don't include workitem builtins diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 7f4620fa6a21df..ea97e504364ba5 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -80,11 +80,13 @@ endfunction() # Custom target to create # * INPUT ... 
# List of bytecode files to link together +# * DEPENDENCIES ... +# List of extra dependencies to inject function(link_bc) cmake_parse_arguments(ARG "" "TARGET" - "INPUTS" + "INPUTS;DEPENDENCIES" ${ARGN} ) @@ -106,7 +108,7 @@ function(link_bc) add_custom_command( OUTPUT ${ARG_TARGET}.bc COMMAND libclc::llvm-link -o ${ARG_TARGET}.bc ${LINK_INPUT_ARG} - DEPENDS libclc::llvm-link ${ARG_INPUTS} ${RSP_FILE} + DEPENDS libclc::llvm-link ${ARG_DEPENDENCIES} ${ARG_INPUTS} ${RSP_FILE} ) add_custom_target( ${ARG_TARGET} ALL DEPENDS ${ARG_TARGET}.bc ) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 4b927017f8c2ac..e098bd574eec73 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -553,6 +553,10 @@ function(cxx_add_basic_build_flags target) target_add_compile_flags_if_supported(${target} PRIVATE -fvisibility=hidden) endif() + # Build with -fsized-deallocation, which is default in recent versions of Clang. + # TODO(LLVM 21): This can be dropped once we only support Clang >= 19. + target_add_compile_flags_if_supported(${target} PRIVATE -fsized-deallocation) + # Let the library headers know they are currently being used to build the # library. target_compile_definitions(${target} PRIVATE -D_LIBCPP_BUILDING_LIBRARY) diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 71de10abb6eaa8..d30021b7eb2347 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -121,6 +121,15 @@ Deprecations and Removals - The ``_LIBCPP_DISABLE_ADDITIONAL_DIAGNOSTICS`` macro has been removed and is not honored anymore. Additional warnings provided by libc++ as a matter of QoI will now be provided unconditionally. +- libc++ no longer supports ``std::allocator<const T>`` and containers of ``const``-qualified element type, such + as ``std::vector<const T>`` and ``std::list<const T>``. This used to be supported as an undocumented extension. + If you were using ``std::vector<const T>``, replace it with ``std::vector<T>`` instead. The + ``_LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST`` macro can be defined to temporarily re-enable this extension as + folks transition their code. This macro will be honored for one release and ignored starting in LLVM 20. + To assist with the clean-up process, consider running your code through Clang Tidy, with + `std-allocator-const ` + enabled.
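A minimal migration sketch for the release note above (illustrative only, not part of this patch; the variable names are invented, and the escape-hatch macro must be defined before any libc++ header is included):

    #include <vector>

    // Before: relied on the undocumented extension that is being removed.
    std::vector<const int> old_values = {1, 2, 3};

    // After: store mutable elements; expose constness through a reference or view.
    std::vector<int> values = {1, 2, 3};
    const std::vector<int>& read_only = values;

    // Transitional escape hatch (honored for one release only), defined before
    // including any libc++ header:
    //   #define _LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST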
+ Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 6453f227cfcc24..e748ff6ad749b7 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -200,7 +200,7 @@ "`3200 `__","``midpoint``\ should not constrain ``T``\ is complete","Prague","|Nothing To Do|","" "`3201 `__","``lerp``\ should be marked as ``noexcept``\ ","Prague","|Complete|","" "`3226 `__","``zoned_time``\ constructor from ``string_view``\ should accept ``zoned_time``\ ","Prague","","","|chrono|" -"`3233 `__","Broken requirements for ``shared_ptr``\ converting constructors","Prague","","" +"`3233 `__","Broken requirements for ``shared_ptr``\ converting constructors","Prague","|Complete|","19.0" "`3237 `__","LWG 3038 and 3190 have inconsistent PRs","Prague","|Complete|","16.0" "`3238 `__","Insufficiently-defined behavior of ``std::function``\ deduction guides","Prague","|Nothing To Do|","" "`3242 `__","``std::format``\ : missing rules for ``arg-id``\ in ``width``\ and ``precision``\ ","Prague","|Complete|","14.0","|format|" diff --git a/libcxx/docs/Status/ParallelismProjects.csv b/libcxx/docs/Status/ParallelismProjects.csv index 2ddac1e52f0264..06087e3d870f93 100644 --- a/libcxx/docs/Status/ParallelismProjects.csv +++ b/libcxx/docs/Status/ParallelismProjects.csv @@ -16,6 +16,7 @@ Section,Description,Dependencies,Assignee,Complete | `[parallel.simd.whereexpr] `_, "Where expression class templates", None, Yin Zhang, |In Progress| | `[parallel.simd.reference] `_, "`Element references operator value_type() `_", None, Yin Zhang, |Complete| | `[parallel.simd.reference] `_, "`Element references operator= `_", None, Yin Zhang, |Complete| +| `[parallel.simd.reference] `_, "`Element references swap functions `_", None, Yin Zhang, |Complete| | `[parallel.simd.class] `_, "`Class template simd declaration and alias `_", [parallel.simd.abi], Yin Zhang, |Complete| | `[parallel.simd.class] `_, "`simd<>::size() `_", [parallel.simd.traits] simd_size[_v], Yin Zhang, |Complete| | `[parallel.simd.class] `_, "`simd default constructor `_", None, Yin Zhang, |Complete| diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h index d7a5b99b546910..7e26434c9c3a0a 100644 --- a/libcxx/include/__atomic/atomic_base.h +++ b/libcxx/include/__atomic/atomic_base.h @@ -33,7 +33,7 @@ struct __atomic_base // false mutable __cxx_atomic_impl<_Tp> __a_; #if _LIBCPP_STD_VER >= 17 - static _LIBCPP_CONSTEXPR bool is_always_lock_free = __libcpp_is_always_lock_free<__cxx_atomic_impl<_Tp> >::__value; + static constexpr bool is_always_lock_free = __libcpp_is_always_lock_free<__cxx_atomic_impl<_Tp> >::__value; #endif _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const volatile _NOEXCEPT { @@ -129,11 +129,6 @@ struct __atomic_base // false __atomic_base(const __atomic_base&) = delete; }; -#if _LIBCPP_STD_VER >= 17 -template -_LIBCPP_CONSTEXPR bool __atomic_base<_Tp, __b>::is_always_lock_free; -#endif - // atomic template diff --git a/libcxx/include/__atomic/atomic_init.h b/libcxx/include/__atomic/atomic_init.h index 2ed37a9a77c86d..8e86ba31b4ac3b 100644 --- a/libcxx/include/__atomic/atomic_init.h +++ b/libcxx/include/__atomic/atomic_init.h @@ -18,7 +18,7 @@ #define ATOMIC_FLAG_INIT {false} #define ATOMIC_VAR_INIT(__v) {__v} -#if _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) +#if _LIBCPP_STD_VER >= 20 && defined(_LIBCPP_COMPILER_CLANG_BASED) && 
!defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) # pragma clang deprecated(ATOMIC_VAR_INIT) #endif diff --git a/libcxx/include/__charconv/chars_format.h b/libcxx/include/__charconv/chars_format.h index 95faa29010dd81..c76cebd5d1847d 100644 --- a/libcxx/include/__charconv/chars_format.h +++ b/libcxx/include/__charconv/chars_format.h @@ -39,20 +39,17 @@ inline _LIBCPP_HIDE_FROM_ABI constexpr chars_format operator^(chars_format __x, return chars_format(std::__to_underlying(__x) ^ std::__to_underlying(__y)); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 chars_format& -operator&=(chars_format& __x, chars_format __y) { +inline _LIBCPP_HIDE_FROM_ABI constexpr chars_format& operator&=(chars_format& __x, chars_format __y) { __x = __x & __y; return __x; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 chars_format& -operator|=(chars_format& __x, chars_format __y) { +inline _LIBCPP_HIDE_FROM_ABI constexpr chars_format& operator|=(chars_format& __x, chars_format __y) { __x = __x | __y; return __x; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 chars_format& -operator^=(chars_format& __x, chars_format __y) { +inline _LIBCPP_HIDE_FROM_ABI constexpr chars_format& operator^=(chars_format& __x, chars_format __y) { __x = __x ^ __y; return __x; } diff --git a/libcxx/include/__chrono/time_point.h b/libcxx/include/__chrono/time_point.h index cb5bf4d13f705b..aaf0b098f280e0 100644 --- a/libcxx/include/__chrono/time_point.h +++ b/libcxx/include/__chrono/time_point.h @@ -92,25 +92,22 @@ time_point_cast(const time_point<_Clock, _Duration>& __t) { #if _LIBCPP_STD_VER >= 17 template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR time_point<_Clock, _ToDuration> -floor(const time_point<_Clock, _Duration>& __t) { +inline _LIBCPP_HIDE_FROM_ABI constexpr time_point<_Clock, _ToDuration> floor(const time_point<_Clock, _Duration>& __t) { return time_point<_Clock, _ToDuration>{chrono::floor<_ToDuration>(__t.time_since_epoch())}; } template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR time_point<_Clock, _ToDuration> -ceil(const time_point<_Clock, _Duration>& __t) { +inline _LIBCPP_HIDE_FROM_ABI constexpr time_point<_Clock, _ToDuration> ceil(const time_point<_Clock, _Duration>& __t) { return time_point<_Clock, _ToDuration>{chrono::ceil<_ToDuration>(__t.time_since_epoch())}; } template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR time_point<_Clock, _ToDuration> -round(const time_point<_Clock, _Duration>& __t) { +inline _LIBCPP_HIDE_FROM_ABI constexpr time_point<_Clock, _ToDuration> round(const time_point<_Clock, _Duration>& __t) { return time_point<_Clock, _ToDuration>{chrono::round<_ToDuration>(__t.time_since_epoch())}; } template ::is_signed, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration<_Rep, _Period> abs(duration<_Rep, _Period> __d) { +inline _LIBCPP_HIDE_FROM_ABI constexpr duration<_Rep, _Period> abs(duration<_Rep, _Period> __d) { return __d >= __d.zero() ? +__d : -__d; } #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h index 7e6e18b22d93aa..ab483a07c9c137 100644 --- a/libcxx/include/__configuration/availability.h +++ b/libcxx/include/__configuration/availability.h @@ -87,171 +87,193 @@ // in all versions of the library are available. 
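Side note on the ``chars_format.h`` hunk above: the compound-assignment operators only become unconditionally ``constexpr``; their meaning is unchanged. A small, illustrative use of the bitmask operators, assuming only the standard ``<charconv>`` definitions:

    #include <charconv>

    // general is specified as fixed | scientific, so composing the flags with the
    // (now constexpr) operators yields the named enumerator.
    constexpr std::chars_format fmt =
        std::chars_format::fixed | std::chars_format::scientific;
    static_assert(fmt == std::chars_format::general);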
#if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) -# define _LIBCPP_INTRODUCED_IN_LLVM_4 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_19 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_9 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_18 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_11 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_11_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_17 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_17_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_12 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_16 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE /* nothing */ + +# define _LIBCPP_INTRODUCED_IN_LLVM_15 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE /* nothing */ # define _LIBCPP_INTRODUCED_IN_LLVM_14 1 # define _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_15 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_13 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_13_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_16 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_12 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_18 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_11 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_11_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_19 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_10 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_10_ATTRIBUTE /* nothing */ + +# define _LIBCPP_INTRODUCED_IN_LLVM_9 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP /* nothing */ + +# define _LIBCPP_INTRODUCED_IN_LLVM_8 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_8_ATTRIBUTE /* nothing */ + +# define _LIBCPP_INTRODUCED_IN_LLVM_4 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE /* nothing */ #elif defined(__APPLE__) -// LLVM 4 -# if defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 50000 -# define _LIBCPP_INTRODUCED_IN_LLVM_4 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE __attribute__((availability(watchos, strict, introduced = 5.0))) +// clang-format off + +// LLVM 19 +// TODO: Fill this in +# define _LIBCPP_INTRODUCED_IN_LLVM_19 0 +# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE __attribute__((unavailable)) + +// LLVM 18 +// TODO: Fill this in +# define _LIBCPP_INTRODUCED_IN_LLVM_18 0 +# define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE __attribute__((unavailable)) + +// LLVM 17 +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 140400) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 170400) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && 
__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 170400) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100400) +# define _LIBCPP_INTRODUCED_IN_LLVM_17 0 # else -# define _LIBCPP_INTRODUCED_IN_LLVM_4 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_17 1 # endif +# define _LIBCPP_INTRODUCED_IN_LLVM_17_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 14.4))) \ + __attribute__((availability(ios, strict, introduced = 17.4))) \ + __attribute__((availability(tvos, strict, introduced = 17.4))) \ + __attribute__((availability(watchos, strict, introduced = 10.4))) + +// LLVM 16 +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 140000) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 170000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 170000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100000) +# define _LIBCPP_INTRODUCED_IN_LLVM_16 0 +# else +# define _LIBCPP_INTRODUCED_IN_LLVM_16 1 +# endif +# define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 14.0))) \ + __attribute__((availability(ios, strict, introduced = 17.0))) \ + __attribute__((availability(tvos, strict, introduced = 17.0))) \ + __attribute__((availability(watchos, strict, introduced = 10.0))) -// LLVM 9 -// clang-format off -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) -// clang-format on -# define _LIBCPP_INTRODUCED_IN_LLVM_9 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE \ - __attribute__((availability(macos, strict, introduced = 10.15))) \ - __attribute__((availability(ios, strict, introduced = 13.0))) \ - __attribute__((availability(tvos, strict, introduced = 13.0))) \ - __attribute__((availability(watchos, strict, introduced = 6.0))) -// clang-format off -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH \ - _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") -// clang-format on +// LLVM 15 +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130400) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && 
__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160500) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160500) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90500) +# define _LIBCPP_INTRODUCED_IN_LLVM_15 0 # else -# define _LIBCPP_INTRODUCED_IN_LLVM_9 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH /* nothing */ -# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_15 1 # endif +# define _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 13.4))) \ + __attribute__((availability(ios, strict, introduced = 16.5))) \ + __attribute__((availability(tvos, strict, introduced = 16.5))) \ + __attribute__((availability(watchos, strict, introduced = 9.5))) -// LLVM 11 -// clang-format off -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000) -// clang-format on -# define _LIBCPP_INTRODUCED_IN_LLVM_11 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_11_ATTRIBUTE \ - __attribute__((availability(macos, strict, introduced = 11.0))) \ - __attribute__((availability(ios, strict, introduced = 14.0))) \ - __attribute__((availability(tvos, strict, introduced = 14.0))) \ - __attribute__((availability(watchos, strict, introduced = 7.0))) +// LLVM 14 +# define _LIBCPP_INTRODUCED_IN_LLVM_14 _LIBCPP_INTRODUCED_IN_LLVM_15 +# define _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE + +// LLVM 13 +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130000) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90000) +# define _LIBCPP_INTRODUCED_IN_LLVM_13 0 # else -# define _LIBCPP_INTRODUCED_IN_LLVM_11 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_11_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_13 1 # endif +# define _LIBCPP_INTRODUCED_IN_LLVM_13_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 13.0))) \ + __attribute__((availability(ios, strict, introduced = 16.0))) \ + __attribute__((availability(tvos, strict, introduced = 16.0))) \ + __attribute__((availability(watchos, strict, introduced = 9.0))) // LLVM 12 -// clang-format off -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 120000) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 150000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 150000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 
80000) -// clang-format on +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 120300) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 150300) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 150300) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 80300) # define _LIBCPP_INTRODUCED_IN_LLVM_12 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE \ - __attribute__((availability(macos, strict, introduced = 12.0))) \ - __attribute__((availability(ios, strict, introduced = 15.0))) \ - __attribute__((availability(tvos, strict, introduced = 15.0))) \ - __attribute__((availability(watchos, strict, introduced = 8.0))) # else # define _LIBCPP_INTRODUCED_IN_LLVM_12 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE /* nothing */ # endif +# define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 12.3))) \ + __attribute__((availability(ios, strict, introduced = 15.3))) \ + __attribute__((availability(tvos, strict, introduced = 15.3))) \ + __attribute__((availability(watchos, strict, introduced = 8.3))) -// LLVM 14 -// clang-format off -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130400) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160500) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160500) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90500) -// clang-format on -# define _LIBCPP_INTRODUCED_IN_LLVM_14 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE \ - __attribute__((availability(macos, strict, introduced = 13.4))) \ - __attribute__((availability(ios, strict, introduced = 16.5))) \ - __attribute__((availability(tvos, strict, introduced = 16.5))) \ - __attribute__((availability(watchos, strict, introduced = 9.5))) +// LLVM 11 +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000) +# define _LIBCPP_INTRODUCED_IN_LLVM_11 0 # else -# define _LIBCPP_INTRODUCED_IN_LLVM_14 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_11 1 # endif +# define _LIBCPP_INTRODUCED_IN_LLVM_11_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 11.0))) \ + __attribute__((availability(ios, strict, introduced = 14.0))) \ + __attribute__((availability(tvos, strict, introduced = 14.0))) \ + __attribute__((availability(watchos, strict, introduced = 7.0))) -// LLVM 15-16 -# define _LIBCPP_INTRODUCED_IN_LLVM_15 _LIBCPP_INTRODUCED_IN_LLVM_16 -# define _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE -// clang-format off -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 140000) || \ - 
(defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 170000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 170000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100000) -// clang-format on -# define _LIBCPP_INTRODUCED_IN_LLVM_16 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE \ - __attribute__((availability(macos, strict, introduced = 14.0))) \ - __attribute__((availability(ios, strict, introduced = 17.0))) \ - __attribute__((availability(tvos, strict, introduced = 17.0))) \ - __attribute__((availability(watchos, strict, introduced = 10.0))) -# else -# define _LIBCPP_INTRODUCED_IN_LLVM_16 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE /* nothing */ -# endif +// LLVM 10 +# define _LIBCPP_INTRODUCED_IN_LLVM_10 _LIBCPP_INTRODUCED_IN_LLVM_11 +# define _LIBCPP_INTRODUCED_IN_LLVM_10_ATTRIBUTE _LIBCPP_INTRODUCED_IN_LLVM_11_ATTRIBUTE -// LLVM 18 -// TODO: Fill this in -# if 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_18 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE __attribute__((unavailable)) +// LLVM 9 +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) +# define _LIBCPP_INTRODUCED_IN_LLVM_9 0 # else -# define _LIBCPP_INTRODUCED_IN_LLVM_18 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_9 1 # endif +# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 10.15))) \ + __attribute__((availability(ios, strict, introduced = 13.0))) \ + __attribute__((availability(tvos, strict, introduced = 13.0))) \ + __attribute__((availability(watchos, strict, introduced = 6.0))) +# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH \ + _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") +# define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") -// LLVM 19 -// TODO: Fill this in -# if 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_19 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE __attribute__((unavailable)) +// LLVM 4 +# if defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 50000 +# define _LIBCPP_INTRODUCED_IN_LLVM_4 0 # else -# define _LIBCPP_INTRODUCED_IN_LLVM_19 1 -# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE /* nothing */ +# define _LIBCPP_INTRODUCED_IN_LLVM_4 1 # endif +# define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE __attribute__((availability(watchos, 
strict, introduced = 5.0))) + +// clang-format on #else diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index 0f994e297a877a..f618b20603e609 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -507,7 +507,9 @@ class expected : private __expected_base<_Tp, _Err> { _And< is_constructible<_Tp, _UfQual>, is_constructible<_Err, _OtherErrQual>, _If<_Not, bool>>::value, - _And< _Not&>>, + _And< + _Not<_And, is_same<_Err, _OtherErr>>>, // use the copy constructor instead, see #92676 + _Not&>>, _Not>>, _Not&>>, _Not>>, diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index fb82cfceadaff7..c7b98035e34bfa 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -516,7 +516,7 @@ struct __policy { } _LIBCPP_HIDE_FROM_ABI static const __policy* __create_empty() { - static const _LIBCPP_CONSTEXPR __policy __policy = { + static constexpr __policy __policy = { nullptr, nullptr, true, @@ -543,7 +543,7 @@ struct __policy { template _LIBCPP_HIDE_FROM_ABI static const __policy* __choose_policy(/* is_small = */ false_type) { - static const _LIBCPP_CONSTEXPR __policy __policy = { + static constexpr __policy __policy = { &__large_clone<_Fun>, &__large_destroy<_Fun>, false, @@ -558,7 +558,7 @@ struct __policy { template _LIBCPP_HIDE_FROM_ABI static const __policy* __choose_policy(/* is_small = */ true_type) { - static const _LIBCPP_CONSTEXPR __policy __policy = { + static constexpr __policy __policy = { nullptr, nullptr, false, diff --git a/libcxx/include/__iterator/data.h b/libcxx/include/__iterator/data.h index 39867390610172..b7c1603652b0e6 100644 --- a/libcxx/include/__iterator/data.h +++ b/libcxx/include/__iterator/data.h @@ -23,12 +23,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 template -constexpr _LIBCPP_HIDE_FROM_ABI auto data(_Cont& __c) _NOEXCEPT_(noexcept(__c.data())) -> decltype(__c.data()) { +constexpr _LIBCPP_HIDE_FROM_ABI auto data(_Cont& __c) noexcept(noexcept(__c.data())) -> decltype(__c.data()) { return __c.data(); } template -constexpr _LIBCPP_HIDE_FROM_ABI auto data(const _Cont& __c) _NOEXCEPT_(noexcept(__c.data())) -> decltype(__c.data()) { +constexpr _LIBCPP_HIDE_FROM_ABI auto data(const _Cont& __c) noexcept(noexcept(__c.data())) -> decltype(__c.data()) { return __c.data(); } diff --git a/libcxx/include/__iterator/move_iterator.h b/libcxx/include/__iterator/move_iterator.h index c266022159a156..a1c53e9bd2b596 100644 --- a/libcxx/include/__iterator/move_iterator.h +++ b/libcxx/include/__iterator/move_iterator.h @@ -105,9 +105,8 @@ class _LIBCPP_TEMPLATE_VIS move_iterator typedef iterator_type pointer; typedef typename iterator_traits::reference __reference; - typedef typename conditional< is_reference<__reference>::value, - __libcpp_remove_reference_t<__reference>&&, - __reference >::type reference; + typedef __conditional_t::value, __libcpp_remove_reference_t<__reference>&&, __reference> + reference; #endif // _LIBCPP_STD_VER >= 20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 explicit move_iterator(_Iter __i) : __current_(std::move(__i)) {} diff --git a/libcxx/include/__iterator/size.h b/libcxx/include/__iterator/size.h index eac3ccf522c315..876e6963f77d91 100644 --- a/libcxx/include/__iterator/size.h +++ b/libcxx/include/__iterator/size.h @@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 template -_LIBCPP_HIDE_FROM_ABI constexpr auto size(const _Cont& __c) 
_NOEXCEPT_(noexcept(__c.size())) -> decltype(__c.size()) { +_LIBCPP_HIDE_FROM_ABI constexpr auto size(const _Cont& __c) noexcept(noexcept(__c.size())) -> decltype(__c.size()) { return __c.size(); } @@ -35,9 +35,9 @@ _LIBCPP_HIDE_FROM_ABI constexpr size_t size(const _Tp (&)[_Sz]) noexcept { # if _LIBCPP_STD_VER >= 20 template -_LIBCPP_HIDE_FROM_ABI constexpr auto ssize(const _Cont& __c) - _NOEXCEPT_(noexcept(static_cast>>(__c.size()))) - -> common_type_t> { +_LIBCPP_HIDE_FROM_ABI constexpr auto +ssize(const _Cont& __c) noexcept(noexcept(static_cast>>( + __c.size()))) -> common_type_t> { return static_cast>>(__c.size()); } diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h index ddf9fabd8cea79..fea0decd8c6af7 100644 --- a/libcxx/include/__mdspan/extents.h +++ b/libcxx/include/__mdspan/extents.h @@ -455,8 +455,15 @@ template using dextents = typename __mdspan_detail::__make_dextents<_IndexType, _Rank>::type; // Deduction guide for extents +# if _LIBCPP_STD_VER >= 26 template -extents(_IndexTypes...) -> extents; + requires(is_convertible_v<_IndexTypes, size_t> && ...) +explicit extents(_IndexTypes...) -> extents...>; +# else +template + requires(is_convertible_v<_IndexTypes, size_t> && ...) +explicit extents(_IndexTypes...) -> extents; +# endif namespace __mdspan_detail { diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index 215d3832f9ef34..2d8624e771bce0 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -14,6 +14,7 @@ #include <__memory/addressof.h> #include <__memory/allocate_at_least.h> #include <__memory/allocator_traits.h> +#include <__type_traits/is_const.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_same.h> #include <__type_traits/is_void.h> @@ -36,8 +37,6 @@ class allocator; // Specializing allocator is deprecated, but not using it. template <> class _LIBCPP_TEMPLATE_VIS allocator { -# if _LIBCPP_STD_VER <= 17 - public: _LIBCPP_DEPRECATED_IN_CXX17 typedef void* pointer; _LIBCPP_DEPRECATED_IN_CXX17 typedef const void* const_pointer; @@ -47,13 +46,12 @@ class _LIBCPP_TEMPLATE_VIS allocator { struct _LIBCPP_DEPRECATED_IN_CXX17 rebind { typedef allocator<_Up> other; }; -# endif }; +// TODO(LLVM 20): Remove the escape hatch +# ifdef _LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST template <> class _LIBCPP_TEMPLATE_VIS allocator { -# if _LIBCPP_STD_VER <= 17 - public: _LIBCPP_DEPRECATED_IN_CXX17 typedef const void* pointer; _LIBCPP_DEPRECATED_IN_CXX17 typedef const void* const_pointer; @@ -63,9 +61,9 @@ class _LIBCPP_TEMPLATE_VIS allocator { struct _LIBCPP_DEPRECATED_IN_CXX17 rebind { typedef allocator<_Up> other; }; -# endif }; -#endif +# endif // _LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST +#endif // _LIBCPP_STD_VER <= 17 // This class provides a non-trivial default constructor to the class that derives from it // if the condition is satisfied. 
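For context on the ``__mdspan/extents.h`` deduction-guide hunk above, here is an editorial sketch (not part of the patch) of what class template argument deduction produces before C++26:

    #include <cstddef>
    #include <mdspan>
    #include <type_traits>

    // Each integral constructor argument deduces a dynamic extent, so this is
    // equivalent to std::dextents<std::size_t, 2>.
    std::extents ext(2, 3);
    static_assert(std::is_same_v<decltype(ext),
                                 std::extents<std::size_t, std::dynamic_extent,
                                              std::dynamic_extent>>);
    static_assert(decltype(ext)::rank() == 2);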
@@ -94,6 +92,7 @@ struct __non_trivial_if { template class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::value, allocator<_Tp> > { + static_assert(!is_const<_Tp>::value, "std::allocator does not support const types"); static_assert(!is_volatile<_Tp>::value, "std::allocator does not support volatile types"); public: @@ -170,6 +169,8 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v #endif }; +// TODO(LLVM 20): Remove the escape hatch +#ifdef _LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST template class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::value, allocator > { @@ -180,9 +181,9 @@ class _LIBCPP_TEMPLATE_VIS allocator typedef ptrdiff_t difference_type; typedef const _Tp value_type; typedef true_type propagate_on_container_move_assignment; -#if _LIBCPP_STD_VER <= 23 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS) +# if _LIBCPP_STD_VER <= 23 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS) _LIBCPP_DEPRECATED_IN_CXX23 typedef true_type is_always_equal; -#endif +# endif _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator() _NOEXCEPT = default; @@ -199,11 +200,11 @@ class _LIBCPP_TEMPLATE_VIS allocator } } -#if _LIBCPP_STD_VER >= 23 +# if _LIBCPP_STD_VER >= 23 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr allocation_result allocate_at_least(size_t __n) { return {allocate(__n), __n}; } -#endif +# endif _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void deallocate(const _Tp* __p, size_t __n) { if (__libcpp_is_constant_evaluated()) { @@ -214,7 +215,7 @@ class _LIBCPP_TEMPLATE_VIS allocator } // C++20 Removed members -#if _LIBCPP_STD_VER <= 17 +# if _LIBCPP_STD_VER <= 17 _LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp* pointer; _LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp* const_pointer; _LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp& reference; @@ -243,8 +244,9 @@ class _LIBCPP_TEMPLATE_VIS allocator } _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI void destroy(pointer __p) { __p->~_Tp(); } -#endif +# endif }; +#endif // _LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool diff --git a/libcxx/include/__memory/allocator_traits.h b/libcxx/include/__memory/allocator_traits.h index 47fe132d15cb1f..ac564f0e6fa0cc 100644 --- a/libcxx/include/__memory/allocator_traits.h +++ b/libcxx/include/__memory/allocator_traits.h @@ -16,6 +16,7 @@ #include <__type_traits/enable_if.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_empty.h> +#include <__type_traits/is_same.h> #include <__type_traits/make_unsigned.h> #include <__type_traits/remove_reference.h> #include <__type_traits/void_t.h> @@ -372,6 +373,14 @@ template using __rebind_alloc = typename _Traits::template rebind_alloc<_Tp>::other; #endif +template +struct __check_valid_allocator : true_type { + using _Traits = std::allocator_traits<_Alloc>; + static_assert(is_same<_Alloc, __rebind_alloc<_Traits, typename _Traits::value_type> >::value, + "[allocator.requirements] states that rebinding an allocator to the same type should result in the " + "original allocator"); +}; + // __is_default_allocator template struct __is_default_allocator : false_type {}; diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 358a851958db1b..d487e4fbe3a953 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -259,7 +259,7 @@ struct __shared_ptr_emplace : __shared_weak_count { class _Allocator = _Alloc, __enable_if_t::value, int> = 0> 
_LIBCPP_HIDE_FROM_ABI explicit __shared_ptr_emplace(_Alloc __a, _Args&&... __args) : __storage_(std::move(__a)) { - using _TpAlloc = typename __allocator_traits_rebind<_Alloc, _Tp>::type; + using _TpAlloc = typename __allocator_traits_rebind<_Alloc, __remove_cv_t<_Tp> >::type; _TpAlloc __tmp(*__get_alloc()); allocator_traits<_TpAlloc>::construct(__tmp, __get_elem(), std::forward<_Args>(__args)...); } @@ -278,7 +278,7 @@ struct __shared_ptr_emplace : __shared_weak_count { template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __on_zero_shared_impl() _NOEXCEPT { - using _TpAlloc = typename __allocator_traits_rebind<_Allocator, _Tp>::type; + using _TpAlloc = typename __allocator_traits_rebind<_Allocator, __remove_cv_t<_Tp> >::type; _TpAlloc __tmp(*__get_alloc()); allocator_traits<_TpAlloc>::destroy(__tmp, __get_elem()); } @@ -403,6 +403,9 @@ struct __shared_ptr_deleter_ctor_reqs { __well_formed_deleter<_Dp, _Yp*>::value; }; +template +using __shared_ptr_nullptr_deleter_ctor_reqs = _And, __well_formed_deleter<_Dp, nullptr_t> >; + #if defined(_LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI) # define _LIBCPP_SHARED_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__)) #else @@ -411,6 +414,8 @@ struct __shared_ptr_deleter_ctor_reqs { template class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { + struct __nullptr_sfinae_tag {}; + public: #if _LIBCPP_STD_VER >= 17 typedef weak_ptr<_Tp> weak_type; @@ -503,7 +508,11 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { } template - _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d) : __ptr_(nullptr) { + _LIBCPP_HIDE_FROM_ABI shared_ptr( + nullptr_t __p, + _Dp __d, + __enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp>::value, __nullptr_sfinae_tag> = __nullptr_sfinae_tag()) + : __ptr_(nullptr) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS @@ -523,7 +532,12 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { } template - _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d, _Alloc __a) : __ptr_(nullptr) { + _LIBCPP_HIDE_FROM_ABI shared_ptr( + nullptr_t __p, + _Dp __d, + _Alloc __a, + __enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp>::value, __nullptr_sfinae_tag> = __nullptr_sfinae_tag()) + : __ptr_(nullptr) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS @@ -598,8 +612,8 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { #if _LIBCPP_STD_VER <= 14 || defined(_LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR) template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI shared_ptr(auto_ptr<_Yp>&& __r) : __ptr_(__r.get()) { - typedef __shared_ptr_pointer<_Yp*, default_delete<_Yp>, allocator<_Yp> > _CntrlBlk; - __cntrl_ = new _CntrlBlk(__r.get(), default_delete<_Yp>(), allocator<_Yp>()); + typedef __shared_ptr_pointer<_Yp*, default_delete<_Yp>, allocator<__remove_cv_t<_Yp> > > _CntrlBlk; + __cntrl_ = new _CntrlBlk(__r.get(), default_delete<_Yp>(), allocator<__remove_cv_t<_Yp> >()); __enable_weak_this(__r.get(), __r.get()); __r.release(); } @@ -776,7 +790,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { private: template ::value> struct __shared_ptr_default_allocator { - typedef allocator<_Yp> type; + typedef allocator<__remove_cv_t<_Yp> > type; }; template @@ -834,7 +848,7 @@ _LIBCPP_HIDE_FROM_ABI shared_ptr<_Tp> allocate_shared(const _Alloc& __a, _Args&& template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI shared_ptr<_Tp> make_shared(_Args&&... 
__args) { - return std::allocate_shared<_Tp>(allocator<_Tp>(), std::forward<_Args>(__args)...); + return std::allocate_shared<_Tp>(allocator<__remove_cv_t<_Tp> >(), std::forward<_Args>(__args)...); } #if _LIBCPP_STD_VER >= 20 @@ -848,7 +862,7 @@ _LIBCPP_HIDE_FROM_ABI shared_ptr<_Tp> allocate_shared_for_overwrite(const _Alloc template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI shared_ptr<_Tp> make_shared_for_overwrite() { - return std::allocate_shared_for_overwrite<_Tp>(allocator<_Tp>()); + return std::allocate_shared_for_overwrite<_Tp>(allocator<__remove_cv_t<_Tp>>()); } #endif // _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__numeric/gcd_lcm.h b/libcxx/include/__numeric/gcd_lcm.h index 87ebbae0157f55..9be6cf8516b131 100644 --- a/libcxx/include/__numeric/gcd_lcm.h +++ b/libcxx/include/__numeric/gcd_lcm.h @@ -37,7 +37,7 @@ struct __ct_abs; template struct __ct_abs<_Result, _Source, true> { - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Result operator()(_Source __t) const noexcept { + constexpr _LIBCPP_HIDE_FROM_ABI _Result operator()(_Source __t) const noexcept { if (__t >= 0) return __t; if (__t == numeric_limits<_Source>::min()) @@ -48,11 +48,11 @@ struct __ct_abs<_Result, _Source, true> { template struct __ct_abs<_Result, _Source, false> { - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Result operator()(_Source __t) const noexcept { return __t; } + constexpr _LIBCPP_HIDE_FROM_ABI _Result operator()(_Source __t) const noexcept { return __t; } }; template -_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __a, _Tp __b) { +constexpr _LIBCPP_HIDDEN _Tp __gcd(_Tp __a, _Tp __b) { static_assert(!is_signed<_Tp>::value, ""); // From: https://lemire.me/blog/2013/12/26/fastest-way-to-compute-the-greatest-common-divisor @@ -96,7 +96,7 @@ _LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __a, _Tp __b) { } template -_LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI common_type_t<_Tp, _Up> gcd(_Tp __m, _Up __n) { +constexpr _LIBCPP_HIDE_FROM_ABI common_type_t<_Tp, _Up> gcd(_Tp __m, _Up __n) { static_assert(is_integral<_Tp>::value && is_integral<_Up>::value, "Arguments to gcd must be integer types"); static_assert(!is_same<__remove_cv_t<_Tp>, bool>::value, "First argument to gcd cannot be bool"); static_assert(!is_same<__remove_cv_t<_Up>, bool>::value, "Second argument to gcd cannot be bool"); @@ -107,7 +107,7 @@ _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI common_type_t<_Tp, _Up> gcd(_Tp __m, _Up } template -_LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI common_type_t<_Tp, _Up> lcm(_Tp __m, _Up __n) { +constexpr _LIBCPP_HIDE_FROM_ABI common_type_t<_Tp, _Up> lcm(_Tp __m, _Up __n) { static_assert(is_integral<_Tp>::value && is_integral<_Up>::value, "Arguments to lcm must be integer types"); static_assert(!is_same<__remove_cv_t<_Tp>, bool>::value, "First argument to lcm cannot be bool"); static_assert(!is_same<__remove_cv_t<_Up>, bool>::value, "Second argument to lcm cannot be bool"); @@ -117,11 +117,13 @@ _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI common_type_t<_Tp, _Up> lcm(_Tp __m, _Up using _Rp = common_type_t<_Tp, _Up>; _Rp __val1 = __ct_abs<_Rp, _Tp>()(__m) / std::gcd(__m, __n); _Rp __val2 = __ct_abs<_Rp, _Up>()(__n); - _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN((numeric_limits<_Rp>::max() / __val1 > __val2), "Overflow in lcm"); - return __val1 * __val2; + _Rp __res; + [[maybe_unused]] bool __overflow = __builtin_mul_overflow(__val1, __val2, &__res); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(!__overflow, "Overflow in lcm"); + return __res; } -#endif // _LIBCPP_STD_VER +#endif // _LIBCPP_STD_VER >= 17 _LIBCPP_END_NAMESPACE_STD diff --git 
a/libcxx/include/__random/discard_block_engine.h b/libcxx/include/__random/discard_block_engine.h index 9279fb66f6942c..07f599067279e6 100644 --- a/libcxx/include/__random/discard_block_engine.h +++ b/libcxx/include/__random/discard_block_engine.h @@ -50,8 +50,8 @@ class _LIBCPP_TEMPLATE_VIS discard_block_engine { static const result_type _Min = _Engine::_Min; static const result_type _Max = _Engine::_Max; #else - static _LIBCPP_CONSTEXPR const result_type _Min = _Engine::min(); - static _LIBCPP_CONSTEXPR const result_type _Max = _Engine::max(); + static constexpr result_type _Min = _Engine::min(); + static constexpr result_type _Max = _Engine::max(); #endif _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type min() { return _Engine::min(); } diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h index 9e6f724241ccf1..c0f5ed936a66d9 100644 --- a/libcxx/include/__ranges/iota_view.h +++ b/libcxx/include/__ranges/iota_view.h @@ -22,7 +22,6 @@ #include <__concepts/semiregular.h> #include <__concepts/totally_ordered.h> #include <__config> -#include <__functional/ranges_operations.h> #include <__iterator/concepts.h> #include <__iterator/incrementable_traits.h> #include <__iterator/iterator_traits.h> @@ -313,8 +312,8 @@ class iota_view : public view_interface> { : __value_(std::move(__value)), __bound_sentinel_(std::move(__bound_sentinel)) { // Validate the precondition if possible. if constexpr (totally_ordered_with<_Start, _BoundSentinel>) { - _LIBCPP_ASSERT_UNCATEGORIZED( - ranges::less_equal()(__value_, __bound_sentinel_), "Precondition violated: value is greater than bound."); + _LIBCPP_ASSERT_VALID_INPUT_RANGE( + bool(__value_ <= __bound_sentinel_), "iota_view: bound must be reachable from value"); } } diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h index 9d347b188ee148..40821f86465b89 100644 --- a/libcxx/include/__string/char_traits.h +++ b/libcxx/include/__string/char_traits.h @@ -270,26 +270,26 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { static inline _LIBCPP_HIDE_FROM_ABI constexpr bool lt(char_type __c1, char_type __c2) noexcept { return __c1 < __c2; } static _LIBCPP_HIDE_FROM_ABI constexpr int - compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept { return std::__constexpr_memcmp(__s1, __s2, __element_count(__n)); } - static _LIBCPP_HIDE_FROM_ABI constexpr size_t length(const char_type* __str) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI constexpr size_t length(const char_type* __str) noexcept { return std::__constexpr_strlen(__str); } _LIBCPP_HIDE_FROM_ABI static constexpr const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + find(const char_type* __s, size_t __n, const char_type& __a) noexcept { return std::__constexpr_memchr(__s, __a, __n); } static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { + move(char_type* __s1, const char_type* __s2, size_t __n) noexcept { return std::__constexpr_memmove(__s1, __s2, __element_count(__n)); } static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { + copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept { _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2), "char_traits::copy: source and destination ranges 
overlap"); std::__constexpr_memmove(__s1, __s2, __element_count(__n)); @@ -297,7 +297,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { } static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - assign(char_type* __s, size_t __n, char_type __a) _NOEXCEPT { + assign(char_type* __s, size_t __n, char_type __a) noexcept { std::fill_n(__s, __n, __a); return __s; } diff --git a/libcxx/include/__type_traits/common_type.h b/libcxx/include/__type_traits/common_type.h index 7f86fcaaace444..f6bd9ed71b7a47 100644 --- a/libcxx/include/__type_traits/common_type.h +++ b/libcxx/include/__type_traits/common_type.h @@ -82,9 +82,9 @@ struct _LIBCPP_TEMPLATE_VIS common_type<_Tp> : public common_type<_Tp, _Tp> {}; // sub-bullet 1 - "If is_same_v is false or ..." template struct _LIBCPP_TEMPLATE_VIS common_type<_Tp, _Up> - : conditional<_IsSame<_Tp, __decay_t<_Tp> >::value && _IsSame<_Up, __decay_t<_Up> >::value, - __common_type2_imp<_Tp, _Up>, - common_type<__decay_t<_Tp>, __decay_t<_Up> > >::type {}; + : __conditional_t<_IsSame<_Tp, __decay_t<_Tp> >::value && _IsSame<_Up, __decay_t<_Up> >::value, + __common_type2_imp<_Tp, _Up>, + common_type<__decay_t<_Tp>, __decay_t<_Up> > > {}; // bullet 4 - sizeof...(Tp) > 2 diff --git a/libcxx/include/__type_traits/decay.h b/libcxx/include/__type_traits/decay.h index 95dccaa29cbe19..7412044f931796 100644 --- a/libcxx/include/__type_traits/decay.h +++ b/libcxx/include/__type_traits/decay.h @@ -43,11 +43,11 @@ struct __decay { template struct __decay<_Up, true> { public: - typedef _LIBCPP_NODEBUG typename conditional< - is_array<_Up>::value, - __add_pointer_t<__remove_extent_t<_Up> >, - typename conditional::value, typename add_pointer<_Up>::type, __remove_cv_t<_Up> >::type >::type - type; + typedef _LIBCPP_NODEBUG + __conditional_t::value, + __add_pointer_t<__remove_extent_t<_Up> >, + __conditional_t::value, typename add_pointer<_Up>::type, __remove_cv_t<_Up> > > + type; }; template diff --git a/libcxx/include/__type_traits/is_nothrow_destructible.h b/libcxx/include/__type_traits/is_nothrow_destructible.h index 370ba27ecd1f1c..c2d5ca87232aa1 100644 --- a/libcxx/include/__type_traits/is_nothrow_destructible.h +++ b/libcxx/include/__type_traits/is_nothrow_destructible.h @@ -12,9 +12,6 @@ #include <__config> #include <__type_traits/integral_constant.h> #include <__type_traits/is_destructible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_scalar.h> -#include <__type_traits/remove_all_extents.h> #include <__utility/declval.h> #include @@ -24,7 +21,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if !defined(_LIBCPP_CXX03_LANG) +#if __has_builtin(__is_nothrow_destructible) + +template +struct _LIBCPP_TEMPLATE_VIS is_nothrow_destructible : integral_constant {}; + +#else template struct __libcpp_is_nothrow_destructible; @@ -49,20 +51,7 @@ struct _LIBCPP_TEMPLATE_VIS is_nothrow_destructible<_Tp&> : public true_type {}; template struct _LIBCPP_TEMPLATE_VIS is_nothrow_destructible<_Tp&&> : public true_type {}; -#else - -template -struct __libcpp_nothrow_destructor : public integral_constant::value || is_reference<_Tp>::value> { -}; - -template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_destructible : public __libcpp_nothrow_destructor<__remove_all_extents_t<_Tp> > { -}; - -template -struct _LIBCPP_TEMPLATE_VIS is_nothrow_destructible<_Tp[]> : public false_type {}; - -#endif +#endif // __has_builtin(__is_nothrow_destructible) #if _LIBCPP_STD_VER >= 17 template diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index 
2da434e0fea41b..0afbebcdc9f2ae 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -135,16 +135,16 @@ struct _LIBCPP_TEMPLATE_VIS pair typename conditional< _MaybeEnable, _CheckArgs, __check_tuple_constructor_fail>::type; template ::__enable_default(), int> = 0> - explicit(!_CheckArgsDep<_Dummy>::__enable_implicit_default()) _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR pair() _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_default_constructible::value) + explicit(!_CheckArgsDep<_Dummy>::__enable_implicit_default()) _LIBCPP_HIDE_FROM_ABI constexpr pair() noexcept( + is_nothrow_default_constructible::value && is_nothrow_default_constructible::value) : first(), second() {} template ::template __is_pair_constructible<_T1 const&, _T2 const&>(), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit( - !_CheckArgsDep<_Dummy>::template __is_implicit<_T1 const&, _T2 const&>()) pair(_T1 const& __t1, _T2 const& __t2) - _NOEXCEPT_(is_nothrow_copy_constructible::value&& is_nothrow_copy_constructible::value) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgsDep<_Dummy>::template __is_implicit<_T1 const&, _T2 const&>()) + pair(_T1 const& __t1, _T2 const& __t2) noexcept(is_nothrow_copy_constructible::value && + is_nothrow_copy_constructible::value) : first(__t1), second(__t2) {} template < @@ -156,10 +156,9 @@ struct _LIBCPP_TEMPLATE_VIS pair class _U2, # endif __enable_if_t<_CheckArgs::template __is_pair_constructible<_U1, _U2>(), int> = 0 > - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgs::template __is_implicit<_U1, _U2>()) pair(_U1&& __u1, _U2&& __u2) - _NOEXCEPT_((is_nothrow_constructible::value && - is_nothrow_constructible::value)) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgs::template __is_implicit<_U1, _U2>()) + pair(_U1&& __u1, _U2&& __u2) noexcept(is_nothrow_constructible::value && + is_nothrow_constructible::value) : first(std::forward<_U1>(__u1)), second(std::forward<_U2>(__u2)) { } @@ -176,16 +175,14 @@ struct _LIBCPP_TEMPLATE_VIS pair __enable_if_t<_CheckArgs::template __is_pair_constructible<_U1 const&, _U2 const&>(), int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgs::template __is_implicit<_U1 const&, _U2 const&>()) - pair(pair<_U1, _U2> const& __p) - _NOEXCEPT_((is_nothrow_constructible::value && - is_nothrow_constructible::value)) + pair(pair<_U1, _U2> const& __p) noexcept(is_nothrow_constructible::value && + is_nothrow_constructible::value) : first(__p.first), second(__p.second) {} template (), int> = 0> - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgs::template __is_implicit<_U1, _U2>()) pair(pair<_U1, _U2>&& __p) - _NOEXCEPT_((is_nothrow_constructible::value && - is_nothrow_constructible::value)) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgs::template __is_implicit<_U1, _U2>()) + pair(pair<_U1, _U2>&& __p) noexcept(is_nothrow_constructible::value && + is_nothrow_constructible::value) : first(std::forward<_U1>(__p.first)), second(std::forward<_U2>(__p.second)) {} # if _LIBCPP_STD_VER >= 23 @@ -219,8 +216,8 @@ struct _LIBCPP_TEMPLATE_VIS pair template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - pair(piecewise_construct_t __pc, tuple<_Args1...> __first_args, tuple<_Args2...> __second_args) _NOEXCEPT_( - is_nothrow_constructible::value&& is_nothrow_constructible::value) + pair(piecewise_construct_t __pc, tuple<_Args1...> __first_args, 
tuple<_Args2...> __second_args) noexcept( + is_nothrow_constructible::value && is_nothrow_constructible::value) : pair(__pc, __first_args, __second_args, @@ -228,19 +225,19 @@ struct _LIBCPP_TEMPLATE_VIS pair typename __make_tuple_indices::type()) {} _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& - operator=(__conditional_t< is_copy_assignable::value && is_copy_assignable::value, - pair, - __nat> const& __p) - _NOEXCEPT_(is_nothrow_copy_assignable::value&& is_nothrow_copy_assignable::value) { + operator=(__conditional_t::value && is_copy_assignable::value, + pair, + __nat> const& __p) noexcept(is_nothrow_copy_assignable::value && + is_nothrow_copy_assignable::value) { first = __p.first; second = __p.second; return *this; } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& operator=( - __conditional_t< is_move_assignable::value && is_move_assignable::value, pair, __nat>&& - __p) - _NOEXCEPT_(is_nothrow_move_assignable::value&& is_nothrow_move_assignable::value) { + __conditional_t::value && is_move_assignable::value, pair, __nat>&& + __p) noexcept(is_nothrow_move_assignable::value && + is_nothrow_move_assignable::value) { first = std::forward(__p.first); second = std::forward(__p.second); return *this; diff --git a/libcxx/include/deque b/libcxx/include/deque index 09888cc7ba78cc..aee4764859dd20 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -449,11 +449,11 @@ public: using value_type = _Tp; - static_assert(is_same::value, - "Allocator::value_type must be same type as value_type"); - using allocator_type = _Allocator; using __alloc_traits = allocator_traits; + static_assert(__check_valid_allocator::value, ""); + static_assert(is_same::value, + "Allocator::value_type must be same type as value_type"); using size_type = typename __alloc_traits::size_type; using difference_type = typename __alloc_traits::difference_type; @@ -488,9 +488,6 @@ public: deque, void>; - static_assert(is_same >::value, - "[allocator.requirements] states that rebinding an allocator to the same type should result in the " - "original allocator"); static_assert(is_nothrow_default_constructible::value == is_nothrow_default_constructible<__pointer_allocator>::value, "rebinding an allocator should not change exception guarantees"); @@ -632,11 +629,11 @@ public: return *this; } - _LIBCPP_HIDE_FROM_ABI deque(deque&& __c) _NOEXCEPT_(is_nothrow_move_constructible::value); + _LIBCPP_HIDE_FROM_ABI deque(deque&& __c) noexcept(is_nothrow_move_constructible::value); _LIBCPP_HIDE_FROM_ABI deque(deque&& __c, const __type_identity_t& __a); - _LIBCPP_HIDE_FROM_ABI deque& operator=(deque&& __c) - _NOEXCEPT_(__alloc_traits::propagate_on_container_move_assignment::value&& - is_nothrow_move_assignable::value); + _LIBCPP_HIDE_FROM_ABI deque& + operator=(deque&& __c) noexcept(__alloc_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable::value); _LIBCPP_HIDE_FROM_ABI void assign(initializer_list __il) { assign(__il.begin(), __il.end()); } #endif // _LIBCPP_CXX03_LANG @@ -1309,7 +1306,7 @@ deque<_Tp, _Allocator>::deque(initializer_list __il, const allocator } template -inline deque<_Tp, _Allocator>::deque(deque&& __c) _NOEXCEPT_(is_nothrow_move_constructible::value) +inline deque<_Tp, _Allocator>::deque(deque&& __c) noexcept(is_nothrow_move_constructible::value) : __map_(std::move(__c.__map_)), __start_(std::move(__c.__start_)), __size_(std::move(__c.__size_)) { __c.__start_ = 0; __c.__size() = 0; @@ -1333,8 +1330,9 @@ inline deque<_Tp, _Allocator>::deque(deque&& __c, const 
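The pair and deque hunks above replace the old _NOEXCEPT_(...) macro with a plain conditional noexcept specifier. A reduced sketch of that idiom, using a made-up my_pair rather than libc++'s class:

// A constructor whose exception specification is computed from the member
// types (C++11 conditional noexcept).
#include <type_traits>
#include <utility>

template <class T1, class T2>
struct my_pair {
  T1 first;
  T2 second;

  my_pair(const T1& t1, const T2& t2) noexcept(std::is_nothrow_copy_constructible<T1>::value &&
                                               std::is_nothrow_copy_constructible<T2>::value)
      : first(t1), second(t2) {}
};

struct MayThrow {
  MayThrow() {}
  MayThrow(const MayThrow&) {} // user-provided, not noexcept
};

static_assert(noexcept(my_pair<int, int>(1, 2)), "both copies are nothrow");
static_assert(!noexcept(my_pair<int, MayThrow>(1, std::declval<const MayThrow&>())),
              "MayThrow's copy constructor may throw");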
__type_identity_t -inline deque<_Tp, _Allocator>& deque<_Tp, _Allocator>::operator=(deque&& __c) _NOEXCEPT_( - __alloc_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable::value) { +inline deque<_Tp, _Allocator>& deque<_Tp, _Allocator>::operator=(deque&& __c) noexcept( + __alloc_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable::value) { __move_assign(__c, integral_constant()); return *this; } @@ -1349,8 +1347,8 @@ void deque<_Tp, _Allocator>::__move_assign(deque& __c, false_type) { } template -void deque<_Tp, _Allocator>::__move_assign(deque& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { +void deque<_Tp, _Allocator>::__move_assign(deque& __c, + true_type) noexcept(is_nothrow_move_assignable::value) { clear(); shrink_to_fit(); __move_assign(__c); diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h index 7efbba96ec71b1..af61dbcc2fe92d 100644 --- a/libcxx/include/experimental/__simd/reference.h +++ b/libcxx/include/experimental/__simd/reference.h @@ -13,10 +13,14 @@ #include <__type_traits/is_assignable.h> #include <__type_traits/is_same.h> #include <__utility/forward.h> +#include <__utility/move.h> #include #include #include +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL @@ -55,10 +59,47 @@ class __simd_reference { __set(static_cast(std::forward<_Up>(__v))); return {__s_, __idx_}; } + + // Note: This approach might not fully align with the specification, + // which might be a wording defect. (https://wg21.link/N4808 section 9.6.3) + template + friend void + swap(__simd_reference<_Tp1, _Storage1, _Vp1>&& __a, __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + + template + friend void swap(_Vp1& __a, __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + + template + friend void swap(__simd_reference<_Tp1, _Storage1, _Vp1>&& __a, _Vp1& __b) noexcept; }; +template +_LIBCPP_HIDE_FROM_ABI void +swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept { + _Vp __tmp(std::move(__a)); + std::move(__a) = std::move(__b); + std::move(__b) = std::move(__tmp); +} + +template +_LIBCPP_HIDE_FROM_ABI void swap(_Vp& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept { + _Vp __tmp(std::move(__a)); + __a = std::move(__b); + std::move(__b) = std::move(__tmp); +} + +template +_LIBCPP_HIDE_FROM_ABI void swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, _Vp& __b) noexcept { + _Vp __tmp(std::move(__a)); + std::move(__a) = std::move(__b); + __b = std::move(__tmp); +} + } // namespace parallelism_v2 _LIBCPP_END_NAMESPACE_EXPERIMENTAL #endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) + +_LIBCPP_POP_MACROS + #endif // _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H diff --git a/libcxx/include/experimental/propagate_const b/libcxx/include/experimental/propagate_const index a30bba9effb142..d7a695d8388923 100644 --- a/libcxx/include/experimental/propagate_const +++ b/libcxx/include/experimental/propagate_const @@ -145,10 +145,10 @@ template class propagate_const; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const _Up& get_underlying(const propagate_const<_Up>& __pu) _NOEXCEPT; +inline _LIBCPP_HIDE_FROM_ABI constexpr const _Up& get_underlying(const propagate_const<_Up>& __pu) _NOEXCEPT; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Up& get_underlying(propagate_const<_Up>& __pu) 
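The swap overloads added to reference.h above let generic code swap simd elements through the proxy reference returned by operator[], which is a prvalue and therefore only binds to rvalue-reference parameters. A hedged usage sketch; it assumes a toolchain where libc++'s experimental library is enabled (for example with -fexperimental-library) and C++17 or newer:

#include <experimental/simd>
#include <utility>

namespace stdx = std::experimental;

void swap_elements() {
  stdx::fixed_size_simd<int, 4> a(1); // all lanes 1
  stdx::fixed_size_simd<int, 4> b(2); // all lanes 2

  using std::swap;
  swap(a[0], b[0]); // proxy <-> proxy: found by ADL, uses the rvalue/rvalue overload
  int x = 42;
  swap(x, a[1]);    // plain value <-> proxy
  swap(a[2], x);    // proxy <-> plain value

  int lane0 = a[0]; // a[0] now holds 2
  (void)lane0;
}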
_NOEXCEPT; +inline _LIBCPP_HIDE_FROM_ABI constexpr _Up& get_underlying(propagate_const<_Up>& __pu) _NOEXCEPT; template class propagate_const { @@ -164,22 +164,22 @@ public: private: template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR element_type* __get_pointer(_Up* __u) { + static _LIBCPP_HIDE_FROM_ABI constexpr element_type* __get_pointer(_Up* __u) { return __u; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR element_type* __get_pointer(_Up& __u) { + static _LIBCPP_HIDE_FROM_ABI constexpr element_type* __get_pointer(_Up& __u) { return __get_pointer(__u.get()); } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const element_type* __get_pointer(const _Up* __u) { + static _LIBCPP_HIDE_FROM_ABI constexpr const element_type* __get_pointer(const _Up* __u) { return __u; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const element_type* __get_pointer(const _Up& __u) { + static _LIBCPP_HIDE_FROM_ABI constexpr const element_type* __get_pointer(const _Up& __u) { return __get_pointer(__u.get()); } @@ -193,214 +193,207 @@ private: public: template - friend _LIBCPP_CONSTEXPR const _Up& - experimental::fundamentals_v2::get_underlying(const propagate_const<_Up>& __pu) _NOEXCEPT; + friend constexpr const _Up& experimental::fundamentals_v2::get_underlying(const propagate_const<_Up>& __pu) _NOEXCEPT; template - friend _LIBCPP_CONSTEXPR _Up& experimental::fundamentals_v2::get_underlying(propagate_const<_Up>& __pu) _NOEXCEPT; + friend constexpr _Up& experimental::fundamentals_v2::get_underlying(propagate_const<_Up>& __pu) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const() = default; + _LIBCPP_HIDE_FROM_ABI constexpr propagate_const() = default; propagate_const(const propagate_const&) = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const(propagate_const&&) = default; + _LIBCPP_HIDE_FROM_ABI constexpr propagate_const(propagate_const&&) = default; template ::value && is_constructible<_Tp, _Up&&>::value, bool> = true> - explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const(propagate_const<_Up>&& __pu) + explicit _LIBCPP_HIDE_FROM_ABI constexpr propagate_const(propagate_const<_Up>&& __pu) : __t_(std::move(experimental::get_underlying(__pu))) {} template ::value && is_constructible<_Tp, _Up&&>::value, bool> = false> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const(propagate_const<_Up>&& __pu) + _LIBCPP_HIDE_FROM_ABI constexpr propagate_const(propagate_const<_Up>&& __pu) : __t_(std::move(experimental::get_underlying(__pu))) {} template ::value && is_constructible<_Tp, _Up&&>::value && !__is_propagate_const>::value, bool> = true> - explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const(_Up&& __u) : __t_(std::forward<_Up>(__u)) {} + explicit _LIBCPP_HIDE_FROM_ABI constexpr propagate_const(_Up&& __u) : __t_(std::forward<_Up>(__u)) {} template ::value && is_constructible<_Tp, _Up&&>::value && !__is_propagate_const>::value, bool> = false> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const(_Up&& __u) : __t_(std::forward<_Up>(__u)) {} + _LIBCPP_HIDE_FROM_ABI constexpr propagate_const(_Up&& __u) : __t_(std::forward<_Up>(__u)) {} propagate_const& operator=(const propagate_const&) = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const& operator=(propagate_const&&) = default; + _LIBCPP_HIDE_FROM_ABI constexpr propagate_const& operator=(propagate_const&&) = default; template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const& operator=(propagate_const<_Up>&& __pu) { + _LIBCPP_HIDE_FROM_ABI 
constexpr propagate_const& operator=(propagate_const<_Up>&& __pu) { __t_ = std::move(experimental::get_underlying(__pu)); return *this; } template >::value>> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR propagate_const& operator=(_Up&& __u) { + _LIBCPP_HIDE_FROM_ABI constexpr propagate_const& operator=(_Up&& __u) { __t_ = std::forward<_Up>(__u); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const element_type* get() const { return __get_pointer(__t_); } + _LIBCPP_HIDE_FROM_ABI constexpr const element_type* get() const { return __get_pointer(__t_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR element_type* get() { return __get_pointer(__t_); } + _LIBCPP_HIDE_FROM_ABI constexpr element_type* get() { return __get_pointer(__t_); } - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR operator bool() const { return get() != nullptr; } + _LIBCPP_HIDE_FROM_ABI explicit constexpr operator bool() const { return get() != nullptr; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const element_type* operator->() const { return get(); } + _LIBCPP_HIDE_FROM_ABI constexpr const element_type* operator->() const { return get(); } template ::value>> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR operator const element_type*() const { + _LIBCPP_HIDE_FROM_ABI constexpr operator const element_type*() const { return get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const element_type& operator*() const { return *get(); } + _LIBCPP_HIDE_FROM_ABI constexpr const element_type& operator*() const { return *get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR element_type* operator->() { return get(); } + _LIBCPP_HIDE_FROM_ABI constexpr element_type* operator->() { return get(); } template ::value>> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR operator element_type*() { + _LIBCPP_HIDE_FROM_ABI constexpr operator element_type*() { return get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR element_type& operator*() { return *get(); } + _LIBCPP_HIDE_FROM_ABI constexpr element_type& operator*() { return *get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR void swap(propagate_const& __pt) _NOEXCEPT_(__is_nothrow_swappable_v<_Tp>) { + _LIBCPP_HIDE_FROM_ABI constexpr void swap(propagate_const& __pt) noexcept(__is_nothrow_swappable_v<_Tp>) { using std::swap; swap(__t_, __pt.__t_); } }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator==(const propagate_const<_Tp>& __pt, nullptr_t) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const propagate_const<_Tp>& __pt, nullptr_t) { return experimental::get_underlying(__pt) == nullptr; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator==(nullptr_t, const propagate_const<_Tp>& __pt) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(nullptr_t, const propagate_const<_Tp>& __pt) { return nullptr == experimental::get_underlying(__pt); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator!=(const propagate_const<_Tp>& __pt, nullptr_t) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const propagate_const<_Tp>& __pt, nullptr_t) { return experimental::get_underlying(__pt) != nullptr; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator!=(nullptr_t, const propagate_const<_Tp>& __pt) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(nullptr_t, const propagate_const<_Tp>& __pt) { return nullptr != experimental::get_underlying(__pt); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator==(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const propagate_const<_Tp>& 
__pt, const propagate_const<_Up>& __pu) { return experimental::get_underlying(__pt) == experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator!=(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { return experimental::get_underlying(__pt) != experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator<(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { return experimental::get_underlying(__pt) < experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator>(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { return experimental::get_underlying(__pt) > experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator<=(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { return experimental::get_underlying(__pt) <= experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator>=(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const propagate_const<_Tp>& __pt, const propagate_const<_Up>& __pu) { return experimental::get_underlying(__pt) >= experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator==(const propagate_const<_Tp>& __pt, const _Up& __u) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const propagate_const<_Tp>& __pt, const _Up& __u) { return experimental::get_underlying(__pt) == __u; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator!=(const propagate_const<_Tp>& __pt, const _Up& __u) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const propagate_const<_Tp>& __pt, const _Up& __u) { return experimental::get_underlying(__pt) != __u; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator<(const propagate_const<_Tp>& __pt, const _Up& __u) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const propagate_const<_Tp>& __pt, const _Up& __u) { return experimental::get_underlying(__pt) < __u; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator>(const propagate_const<_Tp>& __pt, const _Up& __u) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const propagate_const<_Tp>& __pt, const _Up& __u) { return experimental::get_underlying(__pt) > __u; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator<=(const propagate_const<_Tp>& __pt, const _Up& __u) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const propagate_const<_Tp>& __pt, const _Up& __u) { return experimental::get_underlying(__pt) <= __u; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator>=(const propagate_const<_Tp>& __pt, const _Up& __u) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const propagate_const<_Tp>& __pt, const _Up& __u) { return experimental::get_underlying(__pt) >= __u; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator==(const _Tp& __t, const propagate_const<_Up>& __pu) { 
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const _Tp& __t, const propagate_const<_Up>& __pu) { return __t == experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator!=(const _Tp& __t, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const _Tp& __t, const propagate_const<_Up>& __pu) { return __t != experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator<(const _Tp& __t, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const _Tp& __t, const propagate_const<_Up>& __pu) { return __t < experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator>(const _Tp& __t, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const _Tp& __t, const propagate_const<_Up>& __pu) { return __t > experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator<=(const _Tp& __t, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const _Tp& __t, const propagate_const<_Up>& __pu) { return __t <= experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator>=(const _Tp& __t, const propagate_const<_Up>& __pu) { +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const _Tp& __t, const propagate_const<_Up>& __pu) { return __t >= experimental::get_underlying(__pu); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR void swap(propagate_const<_Tp>& __pc1, propagate_const<_Tp>& __pc2) - _NOEXCEPT_(__is_nothrow_swappable_v<_Tp>) { +_LIBCPP_HIDE_FROM_ABI constexpr void +swap(propagate_const<_Tp>& __pc1, propagate_const<_Tp>& __pc2) noexcept(__is_nothrow_swappable_v<_Tp>) { __pc1.swap(__pc2); } template -_LIBCPP_CONSTEXPR const _Tp& get_underlying(const propagate_const<_Tp>& __pt) _NOEXCEPT { +constexpr const _Tp& get_underlying(const propagate_const<_Tp>& __pt) _NOEXCEPT { return __pt.__t_; } template -_LIBCPP_CONSTEXPR _Tp& get_underlying(propagate_const<_Tp>& __pt) _NOEXCEPT { +constexpr _Tp& get_underlying(propagate_const<_Tp>& __pt) _NOEXCEPT { return __pt.__t_; } diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd index fad6431d13a193..484543b81daf1f 100644 --- a/libcxx/include/experimental/simd +++ b/libcxx/include/experimental/simd @@ -78,6 +78,7 @@ inline namespace parallelism_v2 { #include #include #include +#include #include #include #include diff --git a/libcxx/include/experimental/type_traits b/libcxx/include/experimental/type_traits index 37be434f8edd56..31b041bc94c43a 100644 --- a/libcxx/include/experimental/type_traits +++ b/libcxx/include/experimental/type_traits @@ -129,7 +129,7 @@ using is_detected = typename _DETECTOR::value_t; template
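The remaining propagate_const changes above are a mechanical substitution of _LIBCPP_CONSTEXPR with the constexpr keyword. For context, here is a small usage sketch of what the class itself provides, assuming <experimental/propagate_const> is available; names other than the library's are illustrative.

#include <experimental/propagate_const>
#include <memory>

struct Widget {
  int value() { return 2; }        // non-const overload
  int value() const { return 1; }  // const overload
};

struct Owner {
  std::experimental::propagate_const<std::unique_ptr<Widget>> w = std::make_unique<Widget>();

  int read() const { return w->value(); }  // const Owner yields const Widget*, calls value() const
  int write() { return w->value(); }       // non-const Owner calls value()
};

With a plain std::unique_ptr member, both read() and write() would reach the non-const overload; propagate_const is what forwards the wrapper's const-ness to the pointee.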